author    Jens Axboe <axboe@kernel.dk>  2013-11-14 10:29:01 -0500
committer Jens Axboe <axboe@kernel.dk>  2013-11-14 10:29:01 -0500
commit    1355b37f111b35cd6f53078ce63997aec473629f
tree      590ecf1b148fc631336213a956d8456ce85bdc42
parent    f618ef7c47934d1686a764d0c9f70f23e566683f
parent    c86949486d41d9e7d7681fc72923555114fd702f
Merge branch 'for-3.13/post-mq-drivers' into for-linus
 Documentation/blockdev/floppy.txt    |    6
 drivers/block/Kconfig                |   12
 drivers/block/Makefile               |    2
 drivers/block/cciss.c                |    2
 drivers/block/drbd/drbd_int.h        |    3
 drivers/block/drbd/drbd_main.c       |   19
 drivers/block/drbd/drbd_nl.c         |    6
 drivers/block/drbd/drbd_receiver.c   |   45
 drivers/block/drbd/drbd_req.c        |    3
 drivers/block/loop.c                 |   15
 drivers/block/mg_disk.c              |    2
 drivers/block/mtip32xx/mtip32xx.c    |  500
 drivers/block/mtip32xx/mtip32xx.h    |   18
 drivers/block/pktcdvd.c              |   22
 drivers/block/rsxx/core.c            |    8
 drivers/block/rsxx/dev.c             |    8
 drivers/block/rsxx/dma.c             |  119
 drivers/block/rsxx/rsxx_priv.h       |   11
 drivers/block/skd_main.c             | 5432
 drivers/block/skd_s1120.h            |  330
 drivers/block/xen-blkback/blkback.c  |    3
 drivers/block/xen-blkfront.c         |  159
 drivers/md/bcache/Kconfig            |   11
 drivers/md/bcache/alloc.c            |  383
 drivers/md/bcache/bcache.h           |  327
 drivers/md/bcache/bset.c             |  289
 drivers/md/bcache/bset.h             |   93
 drivers/md/bcache/btree.c            | 1396
 drivers/md/bcache/btree.h            |  195
 drivers/md/bcache/closure.c          |  103
 drivers/md/bcache/closure.h          |  183
 drivers/md/bcache/debug.c            |  185
 drivers/md/bcache/debug.h            |   50
 drivers/md/bcache/journal.c          |  293
 drivers/md/bcache/journal.h          |   52
 drivers/md/bcache/movinggc.c         |   87
 drivers/md/bcache/request.c          | 1102
 drivers/md/bcache/request.h          |   43
 drivers/md/bcache/stats.c            |   26
 drivers/md/bcache/stats.h            |   13
 drivers/md/bcache/super.c            |  190
 drivers/md/bcache/sysfs.c            |   42
 drivers/md/bcache/trace.c            |    1
 drivers/md/bcache/util.c             |   12
 drivers/md/bcache/util.h             |   15
 drivers/md/bcache/writeback.c        |  455
 drivers/md/bcache/writeback.h        |   46
 drivers/s390/block/dasd.c            |    8
 include/trace/events/bcache.h        |   47
 include/uapi/linux/bcache.h          |  373
 50 files changed, 9366 insertions(+), 3379 deletions(-)
diff --git a/Documentation/blockdev/floppy.txt b/Documentation/blockdev/floppy.txt
index 470fe4b5e379..e2240f5ab64d 100644
--- a/Documentation/blockdev/floppy.txt
+++ b/Documentation/blockdev/floppy.txt
@@ -39,15 +39,15 @@ Module configuration options
39============================ 39============================
40 40
41 If you use the floppy driver as a module, use the following syntax: 41 If you use the floppy driver as a module, use the following syntax:
42modprobe floppy <options> 42modprobe floppy floppy="<options>"
43 43
44Example: 44Example:
45 modprobe floppy omnibook messages 45 modprobe floppy floppy="omnibook messages"
46 46
47 If you need certain options enabled every time you load the floppy driver, 47 If you need certain options enabled every time you load the floppy driver,
48you can put: 48you can put:
49 49
50 options floppy omnibook messages 50 options floppy floppy="omnibook messages"
51 51
52in a configuration file in /etc/modprobe.d/. 52in a configuration file in /etc/modprobe.d/.
53 53
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 4682546c5da7..1b84778e9bbd 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -110,7 +110,7 @@ source "drivers/block/mtip32xx/Kconfig"
110 110
111config BLK_CPQ_DA 111config BLK_CPQ_DA
112 tristate "Compaq SMART2 support" 112 tristate "Compaq SMART2 support"
113 depends on PCI && VIRT_TO_BUS 113 depends on PCI && VIRT_TO_BUS && 0
114 help 114 help
115 This is the driver for Compaq Smart Array controllers. Everyone 115 This is the driver for Compaq Smart Array controllers. Everyone
116 using these boards should say Y here. See the file 116 using these boards should say Y here. See the file
@@ -319,6 +319,16 @@ config BLK_DEV_NVME
319 To compile this driver as a module, choose M here: the 319 To compile this driver as a module, choose M here: the
320 module will be called nvme. 320 module will be called nvme.
321 321
322config BLK_DEV_SKD
323 tristate "STEC S1120 Block Driver"
324 depends on PCI
325 depends on 64BIT
326 ---help---
327 Saying Y or M here will enable support for the
328 STEC, Inc. S1120 PCIe SSD.
329
330	  Use device /dev/skd$N and /dev/skd$Np$M.
331
322config BLK_DEV_OSD 332config BLK_DEV_OSD
323 tristate "OSD object-as-blkdev support" 333 tristate "OSD object-as-blkdev support"
324 depends on SCSI_OSD_ULD 334 depends on SCSI_OSD_ULD
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 03b3b4a2bd8a..8cc98cd0d4a8 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o
23obj-$(CONFIG_MG_DISK) += mg_disk.o 23obj-$(CONFIG_MG_DISK) += mg_disk.o
24obj-$(CONFIG_SUNVDC) += sunvdc.o 24obj-$(CONFIG_SUNVDC) += sunvdc.o
25obj-$(CONFIG_BLK_DEV_NVME) += nvme.o 25obj-$(CONFIG_BLK_DEV_NVME) += nvme.o
26obj-$(CONFIG_BLK_DEV_SKD) += skd.o
26obj-$(CONFIG_BLK_DEV_OSD) += osdblk.o 27obj-$(CONFIG_BLK_DEV_OSD) += osdblk.o
27 28
28obj-$(CONFIG_BLK_DEV_UMEM) += umem.o 29obj-$(CONFIG_BLK_DEV_UMEM) += umem.o
@@ -44,4 +45,5 @@ obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
44obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o 45obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o
45 46
46nvme-y := nvme-core.o nvme-scsi.o 47nvme-y := nvme-core.o nvme-scsi.o
48skd-y := skd_main.o
47swim_mod-y := swim.o swim_asm.o 49swim_mod-y := swim.o swim_asm.o
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index edfa2515bc86..0c004ac05811 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -5183,7 +5183,7 @@ reinit_after_soft_reset:
5183 rebuild_lun_table(h, 1, 0); 5183 rebuild_lun_table(h, 1, 0);
5184 cciss_engage_scsi(h); 5184 cciss_engage_scsi(h);
5185 h->busy_initializing = 0; 5185 h->busy_initializing = 0;
5186 return 1; 5186 return 0;
5187 5187
5188clean4: 5188clean4:
5189 cciss_free_cmd_pool(h); 5189 cciss_free_cmd_pool(h);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 2d7f608d181c..0e06f0c5dd1e 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1474,7 +1474,8 @@ enum determine_dev_size {
1474 DS_ERROR = -1, 1474 DS_ERROR = -1,
1475 DS_UNCHANGED = 0, 1475 DS_UNCHANGED = 0,
1476 DS_SHRUNK = 1, 1476 DS_SHRUNK = 1,
1477 DS_GREW = 2 1477 DS_GREW = 2,
1478 DS_GREW_FROM_ZERO = 3,
1478}; 1479};
1479extern enum determine_dev_size 1480extern enum determine_dev_size
1480drbd_determine_dev_size(struct drbd_conf *, enum dds_flags, struct resize_parms *) __must_hold(local); 1481drbd_determine_dev_size(struct drbd_conf *, enum dds_flags, struct resize_parms *) __must_hold(local);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 55635edf563b..9e3818b1bc83 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2750,13 +2750,6 @@ int __init drbd_init(void)
2750 return err; 2750 return err;
2751 } 2751 }
2752 2752
2753 err = drbd_genl_register();
2754 if (err) {
2755 printk(KERN_ERR "drbd: unable to register generic netlink family\n");
2756 goto fail;
2757 }
2758
2759
2760 register_reboot_notifier(&drbd_notifier); 2753 register_reboot_notifier(&drbd_notifier);
2761 2754
2762 /* 2755 /*
@@ -2767,6 +2760,15 @@ int __init drbd_init(void)
2767 drbd_proc = NULL; /* play safe for drbd_cleanup */ 2760 drbd_proc = NULL; /* play safe for drbd_cleanup */
2768 idr_init(&minors); 2761 idr_init(&minors);
2769 2762
2763 rwlock_init(&global_state_lock);
2764 INIT_LIST_HEAD(&drbd_tconns);
2765
2766 err = drbd_genl_register();
2767 if (err) {
2768 printk(KERN_ERR "drbd: unable to register generic netlink family\n");
2769 goto fail;
2770 }
2771
2770 err = drbd_create_mempools(); 2772 err = drbd_create_mempools();
2771 if (err) 2773 if (err)
2772 goto fail; 2774 goto fail;
@@ -2778,9 +2780,6 @@ int __init drbd_init(void)
2778 goto fail; 2780 goto fail;
2779 } 2781 }
2780 2782
2781 rwlock_init(&global_state_lock);
2782 INIT_LIST_HEAD(&drbd_tconns);
2783
2784 retry.wq = create_singlethread_workqueue("drbd-reissue"); 2783 retry.wq = create_singlethread_workqueue("drbd-reissue");
2785 if (!retry.wq) { 2784 if (!retry.wq) {
2786 printk(KERN_ERR "drbd: unable to create retry workqueue\n"); 2785 printk(KERN_ERR "drbd: unable to create retry workqueue\n");
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 8cc1e640f485..c706d50a8b06 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -955,7 +955,7 @@ drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags, struct res
955 } 955 }
956 956
957 if (size > la_size_sect) 957 if (size > la_size_sect)
958 rv = DS_GREW; 958 rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO;
959 if (size < la_size_sect) 959 if (size < la_size_sect)
960 rv = DS_SHRUNK; 960 rv = DS_SHRUNK;
961 961
@@ -1132,9 +1132,9 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev)
1132 /* We may ignore peer limits if the peer is modern enough. 1132 /* We may ignore peer limits if the peer is modern enough.
1133 Because new from 8.3.8 onwards the peer can use multiple 1133 Because new from 8.3.8 onwards the peer can use multiple
1134 BIOs for a single peer_request */ 1134 BIOs for a single peer_request */
1135 if (mdev->state.conn >= C_CONNECTED) { 1135 if (mdev->state.conn >= C_WF_REPORT_PARAMS) {
1136 if (mdev->tconn->agreed_pro_version < 94) 1136 if (mdev->tconn->agreed_pro_version < 94)
1137 peer = min( mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); 1137 peer = min(mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
1138 /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ 1138 /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
1139 else if (mdev->tconn->agreed_pro_version == 94) 1139 else if (mdev->tconn->agreed_pro_version == 94)
1140 peer = DRBD_MAX_SIZE_H80_PACKET; 1140 peer = DRBD_MAX_SIZE_H80_PACKET;
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index cc29cd3bf78b..6fa6673b36b3 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1890,29 +1890,11 @@ static u32 seq_max(u32 a, u32 b)
1890 return seq_greater(a, b) ? a : b; 1890 return seq_greater(a, b) ? a : b;
1891} 1891}
1892 1892
1893static bool need_peer_seq(struct drbd_conf *mdev)
1894{
1895 struct drbd_tconn *tconn = mdev->tconn;
1896 int tp;
1897
1898 /*
1899 * We only need to keep track of the last packet_seq number of our peer
1900 * if we are in dual-primary mode and we have the resolve-conflicts flag set; see
1901 * handle_write_conflicts().
1902 */
1903
1904 rcu_read_lock();
1905 tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries;
1906 rcu_read_unlock();
1907
1908 return tp && test_bit(RESOLVE_CONFLICTS, &tconn->flags);
1909}
1910
1911static void update_peer_seq(struct drbd_conf *mdev, unsigned int peer_seq) 1893static void update_peer_seq(struct drbd_conf *mdev, unsigned int peer_seq)
1912{ 1894{
1913 unsigned int newest_peer_seq; 1895 unsigned int newest_peer_seq;
1914 1896
1915 if (need_peer_seq(mdev)) { 1897 if (test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags)) {
1916 spin_lock(&mdev->peer_seq_lock); 1898 spin_lock(&mdev->peer_seq_lock);
1917 newest_peer_seq = seq_max(mdev->peer_seq, peer_seq); 1899 newest_peer_seq = seq_max(mdev->peer_seq, peer_seq);
1918 mdev->peer_seq = newest_peer_seq; 1900 mdev->peer_seq = newest_peer_seq;
@@ -1972,22 +1954,31 @@ static int wait_for_and_update_peer_seq(struct drbd_conf *mdev, const u32 peer_s
1972{ 1954{
1973 DEFINE_WAIT(wait); 1955 DEFINE_WAIT(wait);
1974 long timeout; 1956 long timeout;
1975 int ret; 1957 int ret = 0, tp;
1976 1958
1977 if (!need_peer_seq(mdev)) 1959 if (!test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags))
1978 return 0; 1960 return 0;
1979 1961
1980 spin_lock(&mdev->peer_seq_lock); 1962 spin_lock(&mdev->peer_seq_lock);
1981 for (;;) { 1963 for (;;) {
1982 if (!seq_greater(peer_seq - 1, mdev->peer_seq)) { 1964 if (!seq_greater(peer_seq - 1, mdev->peer_seq)) {
1983 mdev->peer_seq = seq_max(mdev->peer_seq, peer_seq); 1965 mdev->peer_seq = seq_max(mdev->peer_seq, peer_seq);
1984 ret = 0;
1985 break; 1966 break;
1986 } 1967 }
1968
1987 if (signal_pending(current)) { 1969 if (signal_pending(current)) {
1988 ret = -ERESTARTSYS; 1970 ret = -ERESTARTSYS;
1989 break; 1971 break;
1990 } 1972 }
1973
1974 rcu_read_lock();
1975 tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries;
1976 rcu_read_unlock();
1977
1978 if (!tp)
1979 break;
1980
1981 /* Only need to wait if two_primaries is enabled */
1991 prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE); 1982 prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
1992 spin_unlock(&mdev->peer_seq_lock); 1983 spin_unlock(&mdev->peer_seq_lock);
1993 rcu_read_lock(); 1984 rcu_read_lock();
@@ -2228,8 +2219,10 @@ static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi)
2228 } 2219 }
2229 goto out_interrupted; 2220 goto out_interrupted;
2230 } 2221 }
2231 } else 2222 } else {
2223 update_peer_seq(mdev, peer_seq);
2232 spin_lock_irq(&mdev->tconn->req_lock); 2224 spin_lock_irq(&mdev->tconn->req_lock);
2225 }
2233 list_add(&peer_req->w.list, &mdev->active_ee); 2226 list_add(&peer_req->w.list, &mdev->active_ee);
2234 spin_unlock_irq(&mdev->tconn->req_lock); 2227 spin_unlock_irq(&mdev->tconn->req_lock);
2235 2228
@@ -4132,7 +4125,11 @@ recv_bm_rle_bits(struct drbd_conf *mdev,
4132 (unsigned int)bs.buf_len); 4125 (unsigned int)bs.buf_len);
4133 return -EIO; 4126 return -EIO;
4134 } 4127 }
4135 look_ahead >>= bits; 4128 /* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */
4129 if (likely(bits < 64))
4130 look_ahead >>= bits;
4131 else
4132 look_ahead = 0;
4136 have -= bits; 4133 have -= bits;
4137 4134
4138 bits = bitstream_get_bits(&bs, &tmp, 64 - have); 4135 bits = bitstream_get_bits(&bs, &tmp, 64 - have);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index c24379ffd4e3..fec7bef44994 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1306,6 +1306,7 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
1306 int backing_limit; 1306 int backing_limit;
1307 1307
1308 if (bio_size && get_ldev(mdev)) { 1308 if (bio_size && get_ldev(mdev)) {
1309 unsigned int max_hw_sectors = queue_max_hw_sectors(q);
1309 struct request_queue * const b = 1310 struct request_queue * const b =
1310 mdev->ldev->backing_bdev->bd_disk->queue; 1311 mdev->ldev->backing_bdev->bd_disk->queue;
1311 if (b->merge_bvec_fn) { 1312 if (b->merge_bvec_fn) {
@@ -1313,6 +1314,8 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
1313 limit = min(limit, backing_limit); 1314 limit = min(limit, backing_limit);
1314 } 1315 }
1315 put_ldev(mdev); 1316 put_ldev(mdev);
1317 if ((limit >> 9) > max_hw_sectors)
1318 limit = max_hw_sectors << 9;
1316 } 1319 }
1317 return limit; 1320 return limit;
1318} 1321}
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index dbdb88a4976c..c8dac7305244 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -894,13 +894,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
894 894
895 bio_list_init(&lo->lo_bio_list); 895 bio_list_init(&lo->lo_bio_list);
896 896
897 /*
898 * set queue make_request_fn, and add limits based on lower level
899 * device
900 */
901 blk_queue_make_request(lo->lo_queue, loop_make_request);
902 lo->lo_queue->queuedata = lo;
903
904 if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) 897 if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
905 blk_queue_flush(lo->lo_queue, REQ_FLUSH); 898 blk_queue_flush(lo->lo_queue, REQ_FLUSH);
906 899
@@ -1618,6 +1611,8 @@ static int loop_add(struct loop_device **l, int i)
1618 if (!lo) 1611 if (!lo)
1619 goto out; 1612 goto out;
1620 1613
1614 lo->lo_state = Lo_unbound;
1615
1621 /* allocate id, if @id >= 0, we're requesting that specific id */ 1616 /* allocate id, if @id >= 0, we're requesting that specific id */
1622 if (i >= 0) { 1617 if (i >= 0) {
1623 err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL); 1618 err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL);
@@ -1635,6 +1630,12 @@ static int loop_add(struct loop_device **l, int i)
1635 if (!lo->lo_queue) 1630 if (!lo->lo_queue)
1636 goto out_free_idr; 1631 goto out_free_idr;
1637 1632
1633 /*
1634 * set queue make_request_fn
1635 */
1636 blk_queue_make_request(lo->lo_queue, loop_make_request);
1637 lo->lo_queue->queuedata = lo;
1638
1638 disk = lo->lo_disk = alloc_disk(1 << part_shift); 1639 disk = lo->lo_disk = alloc_disk(1 << part_shift);
1639 if (!disk) 1640 if (!disk)
1640 goto out_free_queue; 1641 goto out_free_queue;
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 77a60bedd7a3..7bc363f1ee82 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -936,7 +936,7 @@ static int mg_probe(struct platform_device *plat_dev)
936 goto probe_err_3b; 936 goto probe_err_3b;
937 } 937 }
938 err = request_irq(host->irq, mg_irq, 938 err = request_irq(host->irq, mg_irq,
939 IRQF_DISABLED | IRQF_TRIGGER_RISING, 939 IRQF_TRIGGER_RISING,
940 MG_DEV_NAME, host); 940 MG_DEV_NAME, host);
941 if (err) { 941 if (err) {
942 printk(KERN_ERR "%s:%d fail (request_irq err=%d)\n", 942 printk(KERN_ERR "%s:%d fail (request_irq err=%d)\n",
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 952dbfe22126..050c71267f14 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -126,64 +126,30 @@ struct mtip_compat_ide_task_request_s {
126static bool mtip_check_surprise_removal(struct pci_dev *pdev) 126static bool mtip_check_surprise_removal(struct pci_dev *pdev)
127{ 127{
128 u16 vendor_id = 0; 128 u16 vendor_id = 0;
129 struct driver_data *dd = pci_get_drvdata(pdev);
130
131 if (dd->sr)
132 return true;
129 133
130 /* Read the vendorID from the configuration space */ 134 /* Read the vendorID from the configuration space */
131 pci_read_config_word(pdev, 0x00, &vendor_id); 135 pci_read_config_word(pdev, 0x00, &vendor_id);
132 if (vendor_id == 0xFFFF) 136 if (vendor_id == 0xFFFF) {
137 dd->sr = true;
138 if (dd->queue)
139 set_bit(QUEUE_FLAG_DEAD, &dd->queue->queue_flags);
140 else
141 dev_warn(&dd->pdev->dev,
142 "%s: dd->queue is NULL\n", __func__);
143 if (dd->port) {
144 set_bit(MTIP_PF_SR_CLEANUP_BIT, &dd->port->flags);
145 wake_up_interruptible(&dd->port->svc_wait);
146 } else
147 dev_warn(&dd->pdev->dev,
148 "%s: dd->port is NULL\n", __func__);
133 return true; /* device removed */ 149 return true; /* device removed */
134
135 return false; /* device present */
136}
137
138/*
139 * This function is called for clean the pending command in the
140 * command slot during the surprise removal of device and return
141 * error to the upper layer.
142 *
143 * @dd Pointer to the DRIVER_DATA structure.
144 *
145 * return value
146 * None
147 */
148static void mtip_command_cleanup(struct driver_data *dd)
149{
150 int group = 0, commandslot = 0, commandindex = 0;
151 struct mtip_cmd *command;
152 struct mtip_port *port = dd->port;
153 static int in_progress;
154
155 if (in_progress)
156 return;
157
158 in_progress = 1;
159
160 for (group = 0; group < 4; group++) {
161 for (commandslot = 0; commandslot < 32; commandslot++) {
162 if (!(port->allocated[group] & (1 << commandslot)))
163 continue;
164
165 commandindex = group << 5 | commandslot;
166 command = &port->commands[commandindex];
167
168 if (atomic_read(&command->active)
169 && (command->async_callback)) {
170 command->async_callback(command->async_data,
171 -ENODEV);
172 command->async_callback = NULL;
173 command->async_data = NULL;
174 }
175
176 dma_unmap_sg(&port->dd->pdev->dev,
177 command->sg,
178 command->scatter_ents,
179 command->direction);
180 }
181 } 150 }
182 151
183 up(&port->cmd_slot); 152 return false; /* device present */
184
185 set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag);
186 in_progress = 0;
187} 153}
188 154
189/* 155/*
@@ -222,10 +188,7 @@ static int get_slot(struct mtip_port *port)
222 } 188 }
223 dev_warn(&port->dd->pdev->dev, "Failed to get a tag.\n"); 189 dev_warn(&port->dd->pdev->dev, "Failed to get a tag.\n");
224 190
225 if (mtip_check_surprise_removal(port->dd->pdev)) { 191 mtip_check_surprise_removal(port->dd->pdev);
226 /* Device not present, clean outstanding commands */
227 mtip_command_cleanup(port->dd);
228 }
229 return -1; 192 return -1;
230} 193}
231 194
@@ -246,6 +209,107 @@ static inline void release_slot(struct mtip_port *port, int tag)
246} 209}
247 210
248/* 211/*
212 * IO completion function.
213 *
214 * This completion function is called by the driver ISR when a
215 * command that was issued by the kernel completes. It first calls the
216 * asynchronous completion function which normally calls back into the block
217 * layer passing the asynchronous callback data, then unmaps the
218 * scatter list associated with the completed command, and finally
219 * clears the allocated bit associated with the completed command.
220 *
221 * @port Pointer to the port data structure.
222 * @tag Tag of the command.
223 * @data Pointer to driver_data.
224 * @status Completion status.
225 *
226 * return value
227 * None
228 */
229static void mtip_async_complete(struct mtip_port *port,
230 int tag,
231 void *data,
232 int status)
233{
234 struct mtip_cmd *command;
235 struct driver_data *dd = data;
236 int cb_status = status ? -EIO : 0;
237
238 if (unlikely(!dd) || unlikely(!port))
239 return;
240
241 command = &port->commands[tag];
242
243 if (unlikely(status == PORT_IRQ_TF_ERR)) {
244 dev_warn(&port->dd->pdev->dev,
245 "Command tag %d failed due to TFE\n", tag);
246 }
247
248 /* Upper layer callback */
249 if (likely(command->async_callback))
250 command->async_callback(command->async_data, cb_status);
251
252 command->async_callback = NULL;
253 command->comp_func = NULL;
254
255 /* Unmap the DMA scatter list entries */
256 dma_unmap_sg(&dd->pdev->dev,
257 command->sg,
258 command->scatter_ents,
259 command->direction);
260
261 /* Clear the allocated and active bits for the command */
262 atomic_set(&port->commands[tag].active, 0);
263 release_slot(port, tag);
264
265 up(&port->cmd_slot);
266}
267
268/*
269 * This function is called for clean the pending command in the
270 * command slot during the surprise removal of device and return
271 * error to the upper layer.
272 *
273 * @dd Pointer to the DRIVER_DATA structure.
274 *
275 * return value
276 * None
277 */
278static void mtip_command_cleanup(struct driver_data *dd)
279{
280 int tag = 0;
281 struct mtip_cmd *cmd;
282 struct mtip_port *port = dd->port;
283 unsigned int num_cmd_slots = dd->slot_groups * 32;
284
285 if (!test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag))
286 return;
287
288 if (!port)
289 return;
290
291 cmd = &port->commands[MTIP_TAG_INTERNAL];
292 if (atomic_read(&cmd->active))
293 if (readl(port->cmd_issue[MTIP_TAG_INTERNAL]) &
294 (1 << MTIP_TAG_INTERNAL))
295 if (cmd->comp_func)
296 cmd->comp_func(port, MTIP_TAG_INTERNAL,
297 cmd->comp_data, -ENODEV);
298
299 while (1) {
300 tag = find_next_bit(port->allocated, num_cmd_slots, tag);
301 if (tag >= num_cmd_slots)
302 break;
303
304 cmd = &port->commands[tag];
305 if (atomic_read(&cmd->active))
306 mtip_async_complete(port, tag, dd, -ENODEV);
307 }
308
309 set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag);
310}
311
312/*
249 * Reset the HBA (without sleeping) 313 * Reset the HBA (without sleeping)
250 * 314 *
251 * @dd Pointer to the driver data structure. 315 * @dd Pointer to the driver data structure.
@@ -584,6 +648,9 @@ static void mtip_timeout_function(unsigned long int data)
584 if (unlikely(!port)) 648 if (unlikely(!port))
585 return; 649 return;
586 650
651 if (unlikely(port->dd->sr))
652 return;
653
587 if (test_bit(MTIP_DDF_RESUME_BIT, &port->dd->dd_flag)) { 654 if (test_bit(MTIP_DDF_RESUME_BIT, &port->dd->dd_flag)) {
588 mod_timer(&port->cmd_timer, 655 mod_timer(&port->cmd_timer,
589 jiffies + msecs_to_jiffies(30000)); 656 jiffies + msecs_to_jiffies(30000));
@@ -675,66 +742,6 @@ static void mtip_timeout_function(unsigned long int data)
675} 742}
676 743
677/* 744/*
678 * IO completion function.
679 *
680 * This completion function is called by the driver ISR when a
681 * command that was issued by the kernel completes. It first calls the
682 * asynchronous completion function which normally calls back into the block
683 * layer passing the asynchronous callback data, then unmaps the
684 * scatter list associated with the completed command, and finally
685 * clears the allocated bit associated with the completed command.
686 *
687 * @port Pointer to the port data structure.
688 * @tag Tag of the command.
689 * @data Pointer to driver_data.
690 * @status Completion status.
691 *
692 * return value
693 * None
694 */
695static void mtip_async_complete(struct mtip_port *port,
696 int tag,
697 void *data,
698 int status)
699{
700 struct mtip_cmd *command;
701 struct driver_data *dd = data;
702 int cb_status = status ? -EIO : 0;
703
704 if (unlikely(!dd) || unlikely(!port))
705 return;
706
707 command = &port->commands[tag];
708
709 if (unlikely(status == PORT_IRQ_TF_ERR)) {
710 dev_warn(&port->dd->pdev->dev,
711 "Command tag %d failed due to TFE\n", tag);
712 }
713
714 /* Upper layer callback */
715 if (likely(command->async_callback))
716 command->async_callback(command->async_data, cb_status);
717
718 command->async_callback = NULL;
719 command->comp_func = NULL;
720
721 /* Unmap the DMA scatter list entries */
722 dma_unmap_sg(&dd->pdev->dev,
723 command->sg,
724 command->scatter_ents,
725 command->direction);
726
727 /* Clear the allocated and active bits for the command */
728 atomic_set(&port->commands[tag].active, 0);
729 release_slot(port, tag);
730
731 if (unlikely(command->unaligned))
732 up(&port->cmd_slot_unal);
733 else
734 up(&port->cmd_slot);
735}
736
737/*
738 * Internal command completion callback function. 745 * Internal command completion callback function.
739 * 746 *
740 * This function is normally called by the driver ISR when an internal 747 * This function is normally called by the driver ISR when an internal
@@ -854,7 +861,6 @@ static void mtip_handle_tfe(struct driver_data *dd)
854 "Missing completion func for tag %d", 861 "Missing completion func for tag %d",
855 tag); 862 tag);
856 if (mtip_check_surprise_removal(dd->pdev)) { 863 if (mtip_check_surprise_removal(dd->pdev)) {
857 mtip_command_cleanup(dd);
858 /* don't proceed further */ 864 /* don't proceed further */
859 return; 865 return;
860 } 866 }
@@ -1018,14 +1024,12 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group,
1018 command->comp_data, 1024 command->comp_data,
1019 0); 1025 0);
1020 } else { 1026 } else {
1021 dev_warn(&dd->pdev->dev, 1027 dev_dbg(&dd->pdev->dev,
1022 "Null completion " 1028 "Null completion for tag %d",
1023 "for tag %d",
1024 tag); 1029 tag);
1025 1030
1026 if (mtip_check_surprise_removal( 1031 if (mtip_check_surprise_removal(
1027 dd->pdev)) { 1032 dd->pdev)) {
1028 mtip_command_cleanup(dd);
1029 return; 1033 return;
1030 } 1034 }
1031 } 1035 }
@@ -1145,7 +1149,6 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data)
1145 1149
1146 if (unlikely(port_stat & PORT_IRQ_ERR)) { 1150 if (unlikely(port_stat & PORT_IRQ_ERR)) {
1147 if (unlikely(mtip_check_surprise_removal(dd->pdev))) { 1151 if (unlikely(mtip_check_surprise_removal(dd->pdev))) {
1148 mtip_command_cleanup(dd);
1149 /* don't proceed further */ 1152 /* don't proceed further */
1150 return IRQ_HANDLED; 1153 return IRQ_HANDLED;
1151 } 1154 }
@@ -2806,34 +2809,51 @@ static ssize_t show_device_status(struct device_driver *drv, char *buf)
2806static ssize_t mtip_hw_read_device_status(struct file *f, char __user *ubuf, 2809static ssize_t mtip_hw_read_device_status(struct file *f, char __user *ubuf,
2807 size_t len, loff_t *offset) 2810 size_t len, loff_t *offset)
2808{ 2811{
2812 struct driver_data *dd = (struct driver_data *)f->private_data;
2809 int size = *offset; 2813 int size = *offset;
2810 char buf[MTIP_DFS_MAX_BUF_SIZE]; 2814 char *buf;
2815 int rv = 0;
2811 2816
2812 if (!len || *offset) 2817 if (!len || *offset)
2813 return 0; 2818 return 0;
2814 2819
2820 buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL);
2821 if (!buf) {
2822 dev_err(&dd->pdev->dev,
2823 "Memory allocation: status buffer\n");
2824 return -ENOMEM;
2825 }
2826
2815 size += show_device_status(NULL, buf); 2827 size += show_device_status(NULL, buf);
2816 2828
2817 *offset = size <= len ? size : len; 2829 *offset = size <= len ? size : len;
2818 size = copy_to_user(ubuf, buf, *offset); 2830 size = copy_to_user(ubuf, buf, *offset);
2819 if (size) 2831 if (size)
2820 return -EFAULT; 2832 rv = -EFAULT;
2821 2833
2822 return *offset; 2834 kfree(buf);
2835 return rv ? rv : *offset;
2823} 2836}
2824 2837
2825static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf, 2838static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf,
2826 size_t len, loff_t *offset) 2839 size_t len, loff_t *offset)
2827{ 2840{
2828 struct driver_data *dd = (struct driver_data *)f->private_data; 2841 struct driver_data *dd = (struct driver_data *)f->private_data;
2829 char buf[MTIP_DFS_MAX_BUF_SIZE]; 2842 char *buf;
2830 u32 group_allocated; 2843 u32 group_allocated;
2831 int size = *offset; 2844 int size = *offset;
2832 int n; 2845 int n, rv = 0;
2833 2846
2834 if (!len || size) 2847 if (!len || size)
2835 return 0; 2848 return 0;
2836 2849
2850 buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL);
2851 if (!buf) {
2852 dev_err(&dd->pdev->dev,
2853 "Memory allocation: register buffer\n");
2854 return -ENOMEM;
2855 }
2856
2837 size += sprintf(&buf[size], "H/ S ACTive : [ 0x"); 2857 size += sprintf(&buf[size], "H/ S ACTive : [ 0x");
2838 2858
2839 for (n = dd->slot_groups-1; n >= 0; n--) 2859 for (n = dd->slot_groups-1; n >= 0; n--)
@@ -2888,21 +2908,30 @@ static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf,
2888 *offset = size <= len ? size : len; 2908 *offset = size <= len ? size : len;
2889 size = copy_to_user(ubuf, buf, *offset); 2909 size = copy_to_user(ubuf, buf, *offset);
2890 if (size) 2910 if (size)
2891 return -EFAULT; 2911 rv = -EFAULT;
2892 2912
2893 return *offset; 2913 kfree(buf);
2914 return rv ? rv : *offset;
2894} 2915}
2895 2916
2896static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf, 2917static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf,
2897 size_t len, loff_t *offset) 2918 size_t len, loff_t *offset)
2898{ 2919{
2899 struct driver_data *dd = (struct driver_data *)f->private_data; 2920 struct driver_data *dd = (struct driver_data *)f->private_data;
2900 char buf[MTIP_DFS_MAX_BUF_SIZE]; 2921 char *buf;
2901 int size = *offset; 2922 int size = *offset;
2923 int rv = 0;
2902 2924
2903 if (!len || size) 2925 if (!len || size)
2904 return 0; 2926 return 0;
2905 2927
2928 buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL);
2929 if (!buf) {
2930 dev_err(&dd->pdev->dev,
2931 "Memory allocation: flag buffer\n");
2932 return -ENOMEM;
2933 }
2934
2906 size += sprintf(&buf[size], "Flag-port : [ %08lX ]\n", 2935 size += sprintf(&buf[size], "Flag-port : [ %08lX ]\n",
2907 dd->port->flags); 2936 dd->port->flags);
2908 size += sprintf(&buf[size], "Flag-dd : [ %08lX ]\n", 2937 size += sprintf(&buf[size], "Flag-dd : [ %08lX ]\n",
@@ -2911,9 +2940,10 @@ static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf,
2911 *offset = size <= len ? size : len; 2940 *offset = size <= len ? size : len;
2912 size = copy_to_user(ubuf, buf, *offset); 2941 size = copy_to_user(ubuf, buf, *offset);
2913 if (size) 2942 if (size)
2914 return -EFAULT; 2943 rv = -EFAULT;
2915 2944
2916 return *offset; 2945 kfree(buf);
2946 return rv ? rv : *offset;
2917} 2947}
2918 2948
2919static const struct file_operations mtip_device_status_fops = { 2949static const struct file_operations mtip_device_status_fops = {
@@ -3006,6 +3036,46 @@ static void mtip_hw_debugfs_exit(struct driver_data *dd)
3006 debugfs_remove_recursive(dd->dfs_node); 3036 debugfs_remove_recursive(dd->dfs_node);
3007} 3037}
3008 3038
3039static int mtip_free_orphan(struct driver_data *dd)
3040{
3041 struct kobject *kobj;
3042
3043 if (dd->bdev) {
3044 if (dd->bdev->bd_holders >= 1)
3045 return -2;
3046
3047 bdput(dd->bdev);
3048 dd->bdev = NULL;
3049 }
3050
3051 mtip_hw_debugfs_exit(dd);
3052
3053 spin_lock(&rssd_index_lock);
3054 ida_remove(&rssd_index_ida, dd->index);
3055 spin_unlock(&rssd_index_lock);
3056
3057 if (!test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag) &&
3058 test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) {
3059 put_disk(dd->disk);
3060 } else {
3061 if (dd->disk) {
3062 kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
3063 if (kobj) {
3064 mtip_hw_sysfs_exit(dd, kobj);
3065 kobject_put(kobj);
3066 }
3067 del_gendisk(dd->disk);
3068 dd->disk = NULL;
3069 }
3070 if (dd->queue) {
3071 dd->queue->queuedata = NULL;
3072 blk_cleanup_queue(dd->queue);
3073 dd->queue = NULL;
3074 }
3075 }
3076 kfree(dd);
3077 return 0;
3078}
3009 3079
3010/* 3080/*
3011 * Perform any init/resume time hardware setup 3081 * Perform any init/resume time hardware setup
@@ -3154,6 +3224,7 @@ static int mtip_service_thread(void *data)
3154 unsigned long slot, slot_start, slot_wrap; 3224 unsigned long slot, slot_start, slot_wrap;
3155 unsigned int num_cmd_slots = dd->slot_groups * 32; 3225 unsigned int num_cmd_slots = dd->slot_groups * 32;
3156 struct mtip_port *port = dd->port; 3226 struct mtip_port *port = dd->port;
3227 int ret;
3157 3228
3158 while (1) { 3229 while (1) {
3159 /* 3230 /*
@@ -3164,13 +3235,18 @@ static int mtip_service_thread(void *data)
3164 !(port->flags & MTIP_PF_PAUSE_IO)); 3235 !(port->flags & MTIP_PF_PAUSE_IO));
3165 3236
3166 if (kthread_should_stop()) 3237 if (kthread_should_stop())
3238 goto st_out;
3239
3240 set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
3241
3242 /* If I am an orphan, start self cleanup */
3243 if (test_bit(MTIP_PF_SR_CLEANUP_BIT, &port->flags))
3167 break; 3244 break;
3168 3245
3169 if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, 3246 if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
3170 &dd->dd_flag))) 3247 &dd->dd_flag)))
3171 break; 3248 goto st_out;
3172 3249
3173 set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
3174 if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) { 3250 if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) {
3175 slot = 1; 3251 slot = 1;
3176 /* used to restrict the loop to one iteration */ 3252 /* used to restrict the loop to one iteration */
@@ -3201,7 +3277,7 @@ static int mtip_service_thread(void *data)
3201 3277
3202 clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags); 3278 clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags);
3203 } else if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) { 3279 } else if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) {
3204 if (!mtip_ftl_rebuild_poll(dd)) 3280 if (mtip_ftl_rebuild_poll(dd) < 0)
3205 set_bit(MTIP_DDF_REBUILD_FAILED_BIT, 3281 set_bit(MTIP_DDF_REBUILD_FAILED_BIT,
3206 &dd->dd_flag); 3282 &dd->dd_flag);
3207 clear_bit(MTIP_PF_REBUILD_BIT, &port->flags); 3283 clear_bit(MTIP_PF_REBUILD_BIT, &port->flags);
@@ -3209,8 +3285,30 @@ static int mtip_service_thread(void *data)
3209 clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); 3285 clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
3210 3286
3211 if (test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags)) 3287 if (test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags))
3288 goto st_out;
3289 }
3290
3291 /* wait for pci remove to exit */
3292 while (1) {
3293 if (test_bit(MTIP_DDF_REMOVE_DONE_BIT, &dd->dd_flag))
3212 break; 3294 break;
3295 msleep_interruptible(1000);
3296 if (kthread_should_stop())
3297 goto st_out;
3298 }
3299
3300 while (1) {
3301 ret = mtip_free_orphan(dd);
3302 if (!ret) {
3303 /* NOTE: All data structures are invalid, do not
3304 * access any here */
3305 return 0;
3306 }
3307 msleep_interruptible(1000);
3308 if (kthread_should_stop())
3309 goto st_out;
3213 } 3310 }
3311st_out:
3214 return 0; 3312 return 0;
3215} 3313}
3216 3314
@@ -3437,13 +3535,13 @@ static int mtip_hw_init(struct driver_data *dd)
3437 rv = -EFAULT; 3535 rv = -EFAULT;
3438 goto out3; 3536 goto out3;
3439 } 3537 }
3538 mtip_dump_identify(dd->port);
3440 3539
3441 if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) == 3540 if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) ==
3442 MTIP_FTL_REBUILD_MAGIC) { 3541 MTIP_FTL_REBUILD_MAGIC) {
3443 set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags); 3542 set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags);
3444 return MTIP_FTL_REBUILD_MAGIC; 3543 return MTIP_FTL_REBUILD_MAGIC;
3445 } 3544 }
3446 mtip_dump_identify(dd->port);
3447 3545
3448 /* check write protect, over temp and rebuild statuses */ 3546 /* check write protect, over temp and rebuild statuses */
3449 rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ, 3547 rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ,
@@ -3467,8 +3565,8 @@ static int mtip_hw_init(struct driver_data *dd)
3467 } 3565 }
3468 if (buf[288] == 0xBF) { 3566 if (buf[288] == 0xBF) {
3469 dev_info(&dd->pdev->dev, 3567 dev_info(&dd->pdev->dev,
3470 "Drive indicates rebuild has failed.\n"); 3568 "Drive is in security locked state.\n");
3471 /* TODO */ 3569 set_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag);
3472 } 3570 }
3473 } 3571 }
3474 3572
@@ -3523,9 +3621,8 @@ static int mtip_hw_exit(struct driver_data *dd)
3523 * Send standby immediate (E0h) to the drive so that it 3621 * Send standby immediate (E0h) to the drive so that it
3524 * saves its state. 3622 * saves its state.
3525 */ 3623 */
3526 if (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) { 3624 if (!dd->sr) {
3527 3625 if (!test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag))
3528 if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags))
3529 if (mtip_standby_immediate(dd->port)) 3626 if (mtip_standby_immediate(dd->port))
3530 dev_warn(&dd->pdev->dev, 3627 dev_warn(&dd->pdev->dev,
3531 "STANDBY IMMEDIATE failed\n"); 3628 "STANDBY IMMEDIATE failed\n");
@@ -3551,6 +3648,7 @@ static int mtip_hw_exit(struct driver_data *dd)
3551 dd->port->command_list_dma); 3648 dd->port->command_list_dma);
3552 /* Free the memory allocated for the for structure. */ 3649 /* Free the memory allocated for the for structure. */
3553 kfree(dd->port); 3650 kfree(dd->port);
3651 dd->port = NULL;
3554 3652
3555 return 0; 3653 return 0;
3556} 3654}
@@ -3572,7 +3670,8 @@ static int mtip_hw_shutdown(struct driver_data *dd)
3572 * Send standby immediate (E0h) to the drive so that it 3670 * Send standby immediate (E0h) to the drive so that it
3573 * saves its state. 3671 * saves its state.
3574 */ 3672 */
3575 mtip_standby_immediate(dd->port); 3673 if (!dd->sr && dd->port)
3674 mtip_standby_immediate(dd->port);
3576 3675
3577 return 0; 3676 return 0;
3578} 3677}
@@ -3887,6 +3986,10 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio)
3887 bio_endio(bio, -ENODATA); 3986 bio_endio(bio, -ENODATA);
3888 return; 3987 return;
3889 } 3988 }
3989 if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) {
3990 bio_endio(bio, -ENXIO);
3991 return;
3992 }
3890 } 3993 }
3891 3994
3892 if (unlikely(bio->bi_rw & REQ_DISCARD)) { 3995 if (unlikely(bio->bi_rw & REQ_DISCARD)) {
@@ -4010,6 +4113,8 @@ static int mtip_block_initialize(struct driver_data *dd)
4010 dd->disk->private_data = dd; 4113 dd->disk->private_data = dd;
4011 dd->index = index; 4114 dd->index = index;
4012 4115
4116 mtip_hw_debugfs_init(dd);
4117
4013 /* 4118 /*
4014 * if rebuild pending, start the service thread, and delay the block 4119 * if rebuild pending, start the service thread, and delay the block
4015 * queue creation and add_disk() 4120 * queue creation and add_disk()
@@ -4068,6 +4173,7 @@ skip_create_disk:
4068 /* Enable the block device and add it to /dev */ 4173 /* Enable the block device and add it to /dev */
4069 add_disk(dd->disk); 4174 add_disk(dd->disk);
4070 4175
4176 dd->bdev = bdget_disk(dd->disk, 0);
4071 /* 4177 /*
4072 * Now that the disk is active, initialize any sysfs attributes 4178 * Now that the disk is active, initialize any sysfs attributes
4073 * managed by the protocol layer. 4179 * managed by the protocol layer.
@@ -4077,7 +4183,6 @@ skip_create_disk:
4077 mtip_hw_sysfs_init(dd, kobj); 4183 mtip_hw_sysfs_init(dd, kobj);
4078 kobject_put(kobj); 4184 kobject_put(kobj);
4079 } 4185 }
4080 mtip_hw_debugfs_init(dd);
4081 4186
4082 if (dd->mtip_svc_handler) { 4187 if (dd->mtip_svc_handler) {
4083 set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag); 4188 set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag);
@@ -4103,7 +4208,8 @@ start_service_thread:
4103 return rv; 4208 return rv;
4104 4209
4105kthread_run_error: 4210kthread_run_error:
4106 mtip_hw_debugfs_exit(dd); 4211 bdput(dd->bdev);
4212 dd->bdev = NULL;
4107 4213
4108 /* Delete our gendisk. This also removes the device from /dev */ 4214 /* Delete our gendisk. This also removes the device from /dev */
4109 del_gendisk(dd->disk); 4215 del_gendisk(dd->disk);
@@ -4112,6 +4218,7 @@ read_capacity_error:
4112 blk_cleanup_queue(dd->queue); 4218 blk_cleanup_queue(dd->queue);
4113 4219
4114block_queue_alloc_init_error: 4220block_queue_alloc_init_error:
4221 mtip_hw_debugfs_exit(dd);
4115disk_index_error: 4222disk_index_error:
4116 spin_lock(&rssd_index_lock); 4223 spin_lock(&rssd_index_lock);
4117 ida_remove(&rssd_index_ida, index); 4224 ida_remove(&rssd_index_ida, index);
@@ -4141,40 +4248,48 @@ static int mtip_block_remove(struct driver_data *dd)
4141{ 4248{
4142 struct kobject *kobj; 4249 struct kobject *kobj;
4143 4250
4144 if (dd->mtip_svc_handler) { 4251 if (!dd->sr) {
4145 set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags); 4252 mtip_hw_debugfs_exit(dd);
4146 wake_up_interruptible(&dd->port->svc_wait);
4147 kthread_stop(dd->mtip_svc_handler);
4148 }
4149 4253
4150 /* Clean up the sysfs attributes, if created */ 4254 if (dd->mtip_svc_handler) {
4151 if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) { 4255 set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags);
4152 kobj = kobject_get(&disk_to_dev(dd->disk)->kobj); 4256 wake_up_interruptible(&dd->port->svc_wait);
4153 if (kobj) { 4257 kthread_stop(dd->mtip_svc_handler);
4154 mtip_hw_sysfs_exit(dd, kobj);
4155 kobject_put(kobj);
4156 } 4258 }
4157 }
4158 mtip_hw_debugfs_exit(dd);
4159 4259
4160 /* 4260 /* Clean up the sysfs attributes, if created */
4161 * Delete our gendisk structure. This also removes the device 4261 if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) {
4162 * from /dev 4262 kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
4163 */ 4263 if (kobj) {
4164 if (dd->disk) { 4264 mtip_hw_sysfs_exit(dd, kobj);
4165 if (dd->disk->queue) 4265 kobject_put(kobj);
4166 del_gendisk(dd->disk); 4266 }
4167 else 4267 }
4168 put_disk(dd->disk); 4268 /*
4169 } 4269 * Delete our gendisk structure. This also removes the device
4170 4270 * from /dev
4171 spin_lock(&rssd_index_lock); 4271 */
4172 ida_remove(&rssd_index_ida, dd->index); 4272 if (dd->bdev) {
4173 spin_unlock(&rssd_index_lock); 4273 bdput(dd->bdev);
4274 dd->bdev = NULL;
4275 }
4276 if (dd->disk) {
4277 if (dd->disk->queue) {
4278 del_gendisk(dd->disk);
4279 blk_cleanup_queue(dd->queue);
4280 dd->queue = NULL;
4281 } else
4282 put_disk(dd->disk);
4283 }
4284 dd->disk = NULL;
4174 4285
4175 blk_cleanup_queue(dd->queue); 4286 spin_lock(&rssd_index_lock);
4176 dd->disk = NULL; 4287 ida_remove(&rssd_index_ida, dd->index);
4177 dd->queue = NULL; 4288 spin_unlock(&rssd_index_lock);
4289 } else {
4290 dev_info(&dd->pdev->dev, "device %s surprise removal\n",
4291 dd->disk->disk_name);
4292 }
4178 4293
4179 /* De-initialize the protocol layer. */ 4294 /* De-initialize the protocol layer. */
4180 mtip_hw_exit(dd); 4295 mtip_hw_exit(dd);
@@ -4490,8 +4605,7 @@ done:
4490static void mtip_pci_remove(struct pci_dev *pdev) 4605static void mtip_pci_remove(struct pci_dev *pdev)
4491{ 4606{
4492 struct driver_data *dd = pci_get_drvdata(pdev); 4607 struct driver_data *dd = pci_get_drvdata(pdev);
4493 int counter = 0; 4608 unsigned long flags, to;
4494 unsigned long flags;
4495 4609
4496 set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag); 4610 set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag);
4497 4611
@@ -4500,17 +4614,22 @@ static void mtip_pci_remove(struct pci_dev *pdev)
4500 list_add(&dd->remove_list, &removing_list); 4614 list_add(&dd->remove_list, &removing_list);
4501 spin_unlock_irqrestore(&dev_lock, flags); 4615 spin_unlock_irqrestore(&dev_lock, flags);
4502 4616
4503 if (mtip_check_surprise_removal(pdev)) { 4617 mtip_check_surprise_removal(pdev);
4504 while (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) { 4618 synchronize_irq(dd->pdev->irq);
4505 counter++; 4619
4506 msleep(20); 4620 /* Spin until workers are done */
4507 if (counter == 10) { 4621 to = jiffies + msecs_to_jiffies(4000);
4508 /* Cleanup the outstanding commands */ 4622 do {
4509 mtip_command_cleanup(dd); 4623 msleep(20);
4510 break; 4624 } while (atomic_read(&dd->irq_workers_active) != 0 &&
4511 } 4625 time_before(jiffies, to));
4512 } 4626
4627 if (atomic_read(&dd->irq_workers_active) != 0) {
4628 dev_warn(&dd->pdev->dev,
4629 "Completion workers still active!\n");
4513 } 4630 }
4631 /* Cleanup the outstanding commands */
4632 mtip_command_cleanup(dd);
4514 4633
4515 /* Clean up the block layer. */ 4634 /* Clean up the block layer. */
4516 mtip_block_remove(dd); 4635 mtip_block_remove(dd);
@@ -4529,8 +4648,15 @@ static void mtip_pci_remove(struct pci_dev *pdev)
4529 list_del_init(&dd->remove_list); 4648 list_del_init(&dd->remove_list);
4530 spin_unlock_irqrestore(&dev_lock, flags); 4649 spin_unlock_irqrestore(&dev_lock, flags);
4531 4650
4532 kfree(dd); 4651 if (!dd->sr)
4652 kfree(dd);
4653 else
4654 set_bit(MTIP_DDF_REMOVE_DONE_BIT, &dd->dd_flag);
4655
4533 pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); 4656 pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
4657 pci_set_drvdata(pdev, NULL);
4658 pci_dev_put(pdev);
4659
4534} 4660}
4535 4661
4536/* 4662/*
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index 3bb8a295fbe4..9be7a1582ad3 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -140,6 +140,7 @@ enum {
140 MTIP_PF_SVC_THD_ACTIVE_BIT = 4, 140 MTIP_PF_SVC_THD_ACTIVE_BIT = 4,
141 MTIP_PF_ISSUE_CMDS_BIT = 5, 141 MTIP_PF_ISSUE_CMDS_BIT = 5,
142 MTIP_PF_REBUILD_BIT = 6, 142 MTIP_PF_REBUILD_BIT = 6,
143 MTIP_PF_SR_CLEANUP_BIT = 7,
143 MTIP_PF_SVC_THD_STOP_BIT = 8, 144 MTIP_PF_SVC_THD_STOP_BIT = 8,
144 145
145 /* below are bit numbers in 'dd_flag' defined in driver_data */ 146 /* below are bit numbers in 'dd_flag' defined in driver_data */
@@ -147,15 +148,18 @@ enum {
147 MTIP_DDF_REMOVE_PENDING_BIT = 1, 148 MTIP_DDF_REMOVE_PENDING_BIT = 1,
148 MTIP_DDF_OVER_TEMP_BIT = 2, 149 MTIP_DDF_OVER_TEMP_BIT = 2,
149 MTIP_DDF_WRITE_PROTECT_BIT = 3, 150 MTIP_DDF_WRITE_PROTECT_BIT = 3,
150 MTIP_DDF_STOP_IO = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) | 151 MTIP_DDF_REMOVE_DONE_BIT = 4,
151 (1 << MTIP_DDF_SEC_LOCK_BIT) |
152 (1 << MTIP_DDF_OVER_TEMP_BIT) |
153 (1 << MTIP_DDF_WRITE_PROTECT_BIT)),
154
155 MTIP_DDF_CLEANUP_BIT = 5, 152 MTIP_DDF_CLEANUP_BIT = 5,
156 MTIP_DDF_RESUME_BIT = 6, 153 MTIP_DDF_RESUME_BIT = 6,
157 MTIP_DDF_INIT_DONE_BIT = 7, 154 MTIP_DDF_INIT_DONE_BIT = 7,
158 MTIP_DDF_REBUILD_FAILED_BIT = 8, 155 MTIP_DDF_REBUILD_FAILED_BIT = 8,
156
157 MTIP_DDF_STOP_IO = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) |
158 (1 << MTIP_DDF_SEC_LOCK_BIT) |
159 (1 << MTIP_DDF_OVER_TEMP_BIT) |
160 (1 << MTIP_DDF_WRITE_PROTECT_BIT) |
161 (1 << MTIP_DDF_REBUILD_FAILED_BIT)),
162
159}; 163};
160 164
161struct smart_attr { 165struct smart_attr {
@@ -499,6 +503,8 @@ struct driver_data {
499 503
500 bool trim_supp; /* flag indicating trim support */ 504 bool trim_supp; /* flag indicating trim support */
501 505
506 bool sr;
507
502 int numa_node; /* NUMA support */ 508 int numa_node; /* NUMA support */
503 509
504 char workq_name[32]; 510 char workq_name[32];
@@ -511,6 +517,8 @@ struct driver_data {
511 517
512 int isr_binding; 518 int isr_binding;
513 519
520 struct block_device *bdev;
521
514 int unal_qdepth; /* qdepth of unaligned IO queue */ 522 int unal_qdepth; /* qdepth of unaligned IO queue */
515 523
516 struct list_head online_list; /* linkage for online list */ 524 struct list_head online_list; /* linkage for online list */
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 56188475cfd3..ff8668c5efb1 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -473,45 +473,31 @@ static void pkt_debugfs_dev_new(struct pktcdvd_device *pd)
473{ 473{
474 if (!pkt_debugfs_root) 474 if (!pkt_debugfs_root)
475 return; 475 return;
476 pd->dfs_f_info = NULL;
477 pd->dfs_d_root = debugfs_create_dir(pd->name, pkt_debugfs_root); 476 pd->dfs_d_root = debugfs_create_dir(pd->name, pkt_debugfs_root);
478 if (IS_ERR(pd->dfs_d_root)) { 477 if (!pd->dfs_d_root)
479 pd->dfs_d_root = NULL;
480 return; 478 return;
481 } 479
482 pd->dfs_f_info = debugfs_create_file("info", S_IRUGO, 480 pd->dfs_f_info = debugfs_create_file("info", S_IRUGO,
483 pd->dfs_d_root, pd, &debug_fops); 481 pd->dfs_d_root, pd, &debug_fops);
484 if (IS_ERR(pd->dfs_f_info)) {
485 pd->dfs_f_info = NULL;
486 return;
487 }
488} 482}
489 483
490static void pkt_debugfs_dev_remove(struct pktcdvd_device *pd) 484static void pkt_debugfs_dev_remove(struct pktcdvd_device *pd)
491{ 485{
492 if (!pkt_debugfs_root) 486 if (!pkt_debugfs_root)
493 return; 487 return;
494 if (pd->dfs_f_info) 488 debugfs_remove(pd->dfs_f_info);
495 debugfs_remove(pd->dfs_f_info); 489 debugfs_remove(pd->dfs_d_root);
496 pd->dfs_f_info = NULL; 490 pd->dfs_f_info = NULL;
497 if (pd->dfs_d_root)
498 debugfs_remove(pd->dfs_d_root);
499 pd->dfs_d_root = NULL; 491 pd->dfs_d_root = NULL;
500} 492}
501 493
502static void pkt_debugfs_init(void) 494static void pkt_debugfs_init(void)
503{ 495{
504 pkt_debugfs_root = debugfs_create_dir(DRIVER_NAME, NULL); 496 pkt_debugfs_root = debugfs_create_dir(DRIVER_NAME, NULL);
505 if (IS_ERR(pkt_debugfs_root)) {
506 pkt_debugfs_root = NULL;
507 return;
508 }
509} 497}
510 498
511static void pkt_debugfs_cleanup(void) 499static void pkt_debugfs_cleanup(void)
512{ 500{
513 if (!pkt_debugfs_root)
514 return;
515 debugfs_remove(pkt_debugfs_root); 501 debugfs_remove(pkt_debugfs_root);
516 pkt_debugfs_root = NULL; 502 pkt_debugfs_root = NULL;
517} 503}
diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c
index 6e85e21445eb..a8de2eec6ff3 100644
--- a/drivers/block/rsxx/core.c
+++ b/drivers/block/rsxx/core.c
@@ -654,7 +654,8 @@ static void rsxx_eeh_failure(struct pci_dev *dev)
654 for (i = 0; i < card->n_targets; i++) { 654 for (i = 0; i < card->n_targets; i++) {
655 spin_lock_bh(&card->ctrl[i].queue_lock); 655 spin_lock_bh(&card->ctrl[i].queue_lock);
656 cnt = rsxx_cleanup_dma_queue(&card->ctrl[i], 656 cnt = rsxx_cleanup_dma_queue(&card->ctrl[i],
657 &card->ctrl[i].queue); 657 &card->ctrl[i].queue,
658 COMPLETE_DMA);
658 spin_unlock_bh(&card->ctrl[i].queue_lock); 659 spin_unlock_bh(&card->ctrl[i].queue_lock);
659 660
660 cnt += rsxx_dma_cancel(&card->ctrl[i]); 661 cnt += rsxx_dma_cancel(&card->ctrl[i]);
@@ -748,10 +749,6 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev)
748 749
749 card->eeh_state = 0; 750 card->eeh_state = 0;
750 751
751 st = rsxx_eeh_remap_dmas(card);
752 if (st)
753 goto failed_remap_dmas;
754
755 spin_lock_irqsave(&card->irq_lock, flags); 752 spin_lock_irqsave(&card->irq_lock, flags);
756 if (card->n_targets & RSXX_MAX_TARGETS) 753 if (card->n_targets & RSXX_MAX_TARGETS)
757 rsxx_enable_ier_and_isr(card, CR_INTR_ALL_G); 754 rsxx_enable_ier_and_isr(card, CR_INTR_ALL_G);
@@ -778,7 +775,6 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev)
778 return PCI_ERS_RESULT_RECOVERED; 775 return PCI_ERS_RESULT_RECOVERED;
779 776
780failed_hw_buffers_init: 777failed_hw_buffers_init:
781failed_remap_dmas:
782 for (i = 0; i < card->n_targets; i++) { 778 for (i = 0; i < card->n_targets; i++) {
783 if (card->ctrl[i].status.buf) 779 if (card->ctrl[i].status.buf)
784 pci_free_consistent(card->dev, 780 pci_free_consistent(card->dev,
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index d7af441880be..2284f5d3a54a 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -295,13 +295,15 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card)
295 return -ENOMEM; 295 return -ENOMEM;
296 } 296 }
297 297
298 blk_size = card->config.data.block_size; 298 if (card->config_valid) {
299 blk_size = card->config.data.block_size;
300 blk_queue_dma_alignment(card->queue, blk_size - 1);
301 blk_queue_logical_block_size(card->queue, blk_size);
302 }
299 303
300 blk_queue_make_request(card->queue, rsxx_make_request); 304 blk_queue_make_request(card->queue, rsxx_make_request);
301 blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY); 305 blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY);
302 blk_queue_dma_alignment(card->queue, blk_size - 1);
303 blk_queue_max_hw_sectors(card->queue, blkdev_max_hw_sectors); 306 blk_queue_max_hw_sectors(card->queue, blkdev_max_hw_sectors);
304 blk_queue_logical_block_size(card->queue, blk_size);
305 blk_queue_physical_block_size(card->queue, RSXX_HW_BLK_SIZE); 307 blk_queue_physical_block_size(card->queue, RSXX_HW_BLK_SIZE);
306 308
307 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, card->queue); 309 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, card->queue);
diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c
index bed32f16b084..fc88ba3e1bd2 100644
--- a/drivers/block/rsxx/dma.c
+++ b/drivers/block/rsxx/dma.c
@@ -221,6 +221,21 @@ static void dma_intr_coal_auto_tune(struct rsxx_cardinfo *card)
221} 221}
222 222
223/*----------------- RSXX DMA Handling -------------------*/ 223/*----------------- RSXX DMA Handling -------------------*/
224static void rsxx_free_dma(struct rsxx_dma_ctrl *ctrl, struct rsxx_dma *dma)
225{
226 if (dma->cmd != HW_CMD_BLK_DISCARD) {
227 if (!pci_dma_mapping_error(ctrl->card->dev, dma->dma_addr)) {
228 pci_unmap_page(ctrl->card->dev, dma->dma_addr,
229 get_dma_size(dma),
230 dma->cmd == HW_CMD_BLK_WRITE ?
231 PCI_DMA_TODEVICE :
232 PCI_DMA_FROMDEVICE);
233 }
234 }
235
236 kmem_cache_free(rsxx_dma_pool, dma);
237}
238
224static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl, 239static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl,
225 struct rsxx_dma *dma, 240 struct rsxx_dma *dma,
226 unsigned int status) 241 unsigned int status)
@@ -232,21 +247,14 @@ static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl,
232 if (status & DMA_CANCELLED) 247 if (status & DMA_CANCELLED)
233 ctrl->stats.dma_cancelled++; 248 ctrl->stats.dma_cancelled++;
234 249
235 if (dma->dma_addr)
236 pci_unmap_page(ctrl->card->dev, dma->dma_addr,
237 get_dma_size(dma),
238 dma->cmd == HW_CMD_BLK_WRITE ?
239 PCI_DMA_TODEVICE :
240 PCI_DMA_FROMDEVICE);
241
242 if (dma->cb) 250 if (dma->cb)
243 dma->cb(ctrl->card, dma->cb_data, status ? 1 : 0); 251 dma->cb(ctrl->card, dma->cb_data, status ? 1 : 0);
244 252
245 kmem_cache_free(rsxx_dma_pool, dma); 253 rsxx_free_dma(ctrl, dma);
246} 254}
247 255
248int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, 256int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl,
249 struct list_head *q) 257 struct list_head *q, unsigned int done)
250{ 258{
251 struct rsxx_dma *dma; 259 struct rsxx_dma *dma;
252 struct rsxx_dma *tmp; 260 struct rsxx_dma *tmp;
@@ -254,7 +262,10 @@ int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl,
254 262
255 list_for_each_entry_safe(dma, tmp, q, list) { 263 list_for_each_entry_safe(dma, tmp, q, list) {
256 list_del(&dma->list); 264 list_del(&dma->list);
257 rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); 265 if (done & COMPLETE_DMA)
266 rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
267 else
268 rsxx_free_dma(ctrl, dma);
258 cnt++; 269 cnt++;
259 } 270 }
260 271
@@ -370,7 +381,7 @@ static void dma_engine_stalled(unsigned long data)
370 381
371 /* Clean up the DMA queue */ 382 /* Clean up the DMA queue */
372 spin_lock(&ctrl->queue_lock); 383 spin_lock(&ctrl->queue_lock);
373 cnt = rsxx_cleanup_dma_queue(ctrl, &ctrl->queue); 384 cnt = rsxx_cleanup_dma_queue(ctrl, &ctrl->queue, COMPLETE_DMA);
374 spin_unlock(&ctrl->queue_lock); 385 spin_unlock(&ctrl->queue_lock);
375 386
376 cnt += rsxx_dma_cancel(ctrl); 387 cnt += rsxx_dma_cancel(ctrl);
@@ -388,6 +399,7 @@ static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl)
388 int tag; 399 int tag;
389 int cmds_pending = 0; 400 int cmds_pending = 0;
390 struct hw_cmd *hw_cmd_buf; 401 struct hw_cmd *hw_cmd_buf;
402 int dir;
391 403
392 hw_cmd_buf = ctrl->cmd.buf; 404 hw_cmd_buf = ctrl->cmd.buf;
393 405
@@ -424,6 +436,31 @@ static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl)
424 continue; 436 continue;
425 } 437 }
426 438
439 if (dma->cmd != HW_CMD_BLK_DISCARD) {
440 if (dma->cmd == HW_CMD_BLK_WRITE)
441 dir = PCI_DMA_TODEVICE;
442 else
443 dir = PCI_DMA_FROMDEVICE;
444
445 /*
446 * The function pci_map_page is placed here because we
447 * can only, by design, issue up to 255 commands to the
448 * hardware at one time per DMA channel. So the maximum
449 * amount of mapped memory would be 255 * 4 channels *
450 * 4096 Bytes which is less than 2GB, the limit of a x8
451 * Non-HWWD PCIe slot. This way the pci_map_page
452 * function should never fail because of a lack of
453 * mappable memory.
454 */
455 dma->dma_addr = pci_map_page(ctrl->card->dev, dma->page,
456 dma->pg_off, dma->sub_page.cnt << 9, dir);
457 if (pci_dma_mapping_error(ctrl->card->dev, dma->dma_addr)) {
458 push_tracker(ctrl->trackers, tag);
459 rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
460 continue;
461 }
462 }
463
427 set_tracker_dma(ctrl->trackers, tag, dma); 464 set_tracker_dma(ctrl->trackers, tag, dma);
428 hw_cmd_buf[ctrl->cmd.idx].command = dma->cmd; 465 hw_cmd_buf[ctrl->cmd.idx].command = dma->cmd;
429 hw_cmd_buf[ctrl->cmd.idx].tag = tag; 466 hw_cmd_buf[ctrl->cmd.idx].tag = tag;
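The sizing argument in the new comment above works out as follows: 255 outstanding commands per channel x 4 channels x 4096-byte blocks = 4,177,920 bytes (just under 4 MiB), far below the roughly 2 GiB window the comment attributes to an x8 non-HWWD PCIe slot, so the mapping should not fail for lack of mappable memory. A minimal standalone C11 sketch of that bound; the SKETCH_* identifiers are invented for illustration and are not part of the driver:

/* Illustrative only: worst-case DMA-mapping footprint quoted in the
 * rsxx_issue_dmas() comment.  Names are made up for this sketch.
 */
#define SKETCH_MAX_CMDS_PER_CHANNEL 255u   /* hardware queue depth per channel */
#define SKETCH_NUM_CHANNELS         4u
#define SKETCH_BLOCK_BYTES          4096u  /* RSXX_HW_BLK_SIZE */

_Static_assert(SKETCH_MAX_CMDS_PER_CHANNEL * SKETCH_NUM_CHANNELS *
               SKETCH_BLOCK_BYTES == 4177920u,
               "255 * 4 * 4096 = 4,177,920 bytes (~4 MiB), well under 2 GiB");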
@@ -620,14 +657,6 @@ static int rsxx_queue_dma(struct rsxx_cardinfo *card,
620 if (!dma) 657 if (!dma)
621 return -ENOMEM; 658 return -ENOMEM;
622 659
623 dma->dma_addr = pci_map_page(card->dev, page, pg_off, dma_len,
624 dir ? PCI_DMA_TODEVICE :
625 PCI_DMA_FROMDEVICE);
626 if (!dma->dma_addr) {
627 kmem_cache_free(rsxx_dma_pool, dma);
628 return -ENOMEM;
629 }
630
631 dma->cmd = dir ? HW_CMD_BLK_WRITE : HW_CMD_BLK_READ; 660 dma->cmd = dir ? HW_CMD_BLK_WRITE : HW_CMD_BLK_READ;
632 dma->laddr = laddr; 661 dma->laddr = laddr;
633 dma->sub_page.off = (dma_off >> 9); 662 dma->sub_page.off = (dma_off >> 9);
@@ -736,11 +765,9 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
736 return 0; 765 return 0;
737 766
738bvec_err: 767bvec_err:
739 for (i = 0; i < card->n_targets; i++) { 768 for (i = 0; i < card->n_targets; i++)
740 spin_lock_bh(&card->ctrl[i].queue_lock); 769 rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i],
741 rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i]); 770 FREE_DMA);
742 spin_unlock_bh(&card->ctrl[i].queue_lock);
743 }
744 771
745 return st; 772 return st;
746} 773}
@@ -990,7 +1017,7 @@ void rsxx_dma_destroy(struct rsxx_cardinfo *card)
990 1017
991 /* Clean up the DMA queue */ 1018 /* Clean up the DMA queue */
992 spin_lock_bh(&ctrl->queue_lock); 1019 spin_lock_bh(&ctrl->queue_lock);
993 rsxx_cleanup_dma_queue(ctrl, &ctrl->queue); 1020 rsxx_cleanup_dma_queue(ctrl, &ctrl->queue, COMPLETE_DMA);
994 spin_unlock_bh(&ctrl->queue_lock); 1021 spin_unlock_bh(&ctrl->queue_lock);
995 1022
996 rsxx_dma_cancel(ctrl); 1023 rsxx_dma_cancel(ctrl);
@@ -1032,6 +1059,14 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
1032 else 1059 else
1033 card->ctrl[i].stats.reads_issued--; 1060 card->ctrl[i].stats.reads_issued--;
1034 1061
1062 if (dma->cmd != HW_CMD_BLK_DISCARD) {
1063 pci_unmap_page(card->dev, dma->dma_addr,
1064 get_dma_size(dma),
1065 dma->cmd == HW_CMD_BLK_WRITE ?
1066 PCI_DMA_TODEVICE :
1067 PCI_DMA_FROMDEVICE);
1068 }
1069
1035 list_add_tail(&dma->list, &issued_dmas[i]); 1070 list_add_tail(&dma->list, &issued_dmas[i]);
1036 push_tracker(card->ctrl[i].trackers, j); 1071 push_tracker(card->ctrl[i].trackers, j);
1037 cnt++; 1072 cnt++;
@@ -1043,15 +1078,6 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
1043 atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth); 1078 atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth);
1044 card->ctrl[i].stats.sw_q_depth += cnt; 1079 card->ctrl[i].stats.sw_q_depth += cnt;
1045 card->ctrl[i].e_cnt = 0; 1080 card->ctrl[i].e_cnt = 0;
1046
1047 list_for_each_entry(dma, &card->ctrl[i].queue, list) {
1048 if (dma->dma_addr)
1049 pci_unmap_page(card->dev, dma->dma_addr,
1050 get_dma_size(dma),
1051 dma->cmd == HW_CMD_BLK_WRITE ?
1052 PCI_DMA_TODEVICE :
1053 PCI_DMA_FROMDEVICE);
1054 }
1055 spin_unlock_bh(&card->ctrl[i].queue_lock); 1081 spin_unlock_bh(&card->ctrl[i].queue_lock);
1056 } 1082 }
1057 1083
@@ -1060,31 +1086,6 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
1060 return 0; 1086 return 0;
1061} 1087}
1062 1088
1063int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card)
1064{
1065 struct rsxx_dma *dma;
1066 int i;
1067
1068 for (i = 0; i < card->n_targets; i++) {
1069 spin_lock_bh(&card->ctrl[i].queue_lock);
1070 list_for_each_entry(dma, &card->ctrl[i].queue, list) {
1071 dma->dma_addr = pci_map_page(card->dev, dma->page,
1072 dma->pg_off, get_dma_size(dma),
1073 dma->cmd == HW_CMD_BLK_WRITE ?
1074 PCI_DMA_TODEVICE :
1075 PCI_DMA_FROMDEVICE);
1076 if (!dma->dma_addr) {
1077 spin_unlock_bh(&card->ctrl[i].queue_lock);
1078 kmem_cache_free(rsxx_dma_pool, dma);
1079 return -ENOMEM;
1080 }
1081 }
1082 spin_unlock_bh(&card->ctrl[i].queue_lock);
1083 }
1084
1085 return 0;
1086}
1087
1088int rsxx_dma_init(void) 1089int rsxx_dma_init(void)
1089{ 1090{
1090 rsxx_dma_pool = KMEM_CACHE(rsxx_dma, SLAB_HWCACHE_ALIGN); 1091 rsxx_dma_pool = KMEM_CACHE(rsxx_dma, SLAB_HWCACHE_ALIGN);
diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h
index 5ad5055a4104..6bbc64d0f690 100644
--- a/drivers/block/rsxx/rsxx_priv.h
+++ b/drivers/block/rsxx/rsxx_priv.h
@@ -52,7 +52,7 @@ struct proc_cmd;
52#define RS70_PCI_REV_SUPPORTED 4 52#define RS70_PCI_REV_SUPPORTED 4
53 53
54#define DRIVER_NAME "rsxx" 54#define DRIVER_NAME "rsxx"
55#define DRIVER_VERSION "4.0" 55#define DRIVER_VERSION "4.0.3.2516"
56 56
57/* Block size is 4096 */ 57/* Block size is 4096 */
58#define RSXX_HW_BLK_SHIFT 12 58#define RSXX_HW_BLK_SHIFT 12
@@ -345,6 +345,11 @@ enum rsxx_creg_stat {
345 CREG_STAT_TAG_MASK = 0x0000ff00, 345 CREG_STAT_TAG_MASK = 0x0000ff00,
346}; 346};
347 347
348enum rsxx_dma_finish {
349 FREE_DMA = 0x0,
350 COMPLETE_DMA = 0x1,
351};
352
348static inline unsigned int CREG_DATA(int N) 353static inline unsigned int CREG_DATA(int N)
349{ 354{
350 return CREG_DATA0 + (N << 2); 355 return CREG_DATA0 + (N << 2);
@@ -379,7 +384,9 @@ typedef void (*rsxx_dma_cb)(struct rsxx_cardinfo *card,
379int rsxx_dma_setup(struct rsxx_cardinfo *card); 384int rsxx_dma_setup(struct rsxx_cardinfo *card);
380void rsxx_dma_destroy(struct rsxx_cardinfo *card); 385void rsxx_dma_destroy(struct rsxx_cardinfo *card);
381int rsxx_dma_init(void); 386int rsxx_dma_init(void);
382int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, struct list_head *q); 387int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl,
388 struct list_head *q,
389 unsigned int done);
383int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl); 390int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl);
384void rsxx_dma_cleanup(void); 391void rsxx_dma_cleanup(void);
385void rsxx_dma_queue_reset(struct rsxx_cardinfo *card); 392void rsxx_dma_queue_reset(struct rsxx_cardinfo *card);
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
new file mode 100644
index 000000000000..9199c93be926
--- /dev/null
+++ b/drivers/block/skd_main.c
@@ -0,0 +1,5432 @@
1/* Copyright 2012 STEC, Inc.
2 *
3 * This file is licensed under the terms of the 3-clause
4 * BSD License (http://opensource.org/licenses/BSD-3-Clause)
5 * or the GNU GPL-2.0 (http://www.gnu.org/licenses/gpl-2.0.html),
6 * at your option. Both licenses are also available in the LICENSE file
7 * distributed with this project. This file may not be copied, modified,
8 * or distributed except in accordance with those terms.
9 * Gordoni Waidhofer <gwaidhofer@stec-inc.com>
10 * Initial Driver Design!
11 * Thomas Swann <tswann@stec-inc.com>
12 * Interrupt handling.
13 * Ramprasad Chinthekindi <rchinthekindi@stec-inc.com>
14 * biomode implementation.
15 * Akhil Bhansali <abhansali@stec-inc.com>
16 * Added support for DISCARD / FLUSH and FUA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/module.h>
21#include <linux/init.h>
22#include <linux/pci.h>
23#include <linux/slab.h>
24#include <linux/spinlock.h>
25#include <linux/blkdev.h>
26#include <linux/sched.h>
27#include <linux/interrupt.h>
28#include <linux/compiler.h>
29#include <linux/workqueue.h>
30#include <linux/bitops.h>
31#include <linux/delay.h>
32#include <linux/time.h>
33#include <linux/hdreg.h>
34#include <linux/dma-mapping.h>
35#include <linux/completion.h>
36#include <linux/scatterlist.h>
37#include <linux/version.h>
38#include <linux/err.h>
39#include <linux/scatterlist.h>
40#include <linux/aer.h>
41#include <linux/ctype.h>
42#include <linux/wait.h>
43#include <linux/uio.h>
44#include <scsi/scsi.h>
45#include <scsi/sg.h>
46#include <linux/io.h>
47#include <linux/uaccess.h>
48#include <asm/unaligned.h>
49
50#include "skd_s1120.h"
51
52static int skd_dbg_level;
53static int skd_isr_comp_limit = 4;
54
55enum {
56 STEC_LINK_2_5GTS = 0,
57 STEC_LINK_5GTS = 1,
58 STEC_LINK_8GTS = 2,
59 STEC_LINK_UNKNOWN = 0xFF
60};
61
62enum {
63 SKD_FLUSH_INITIALIZER,
64 SKD_FLUSH_ZERO_SIZE_FIRST,
65 SKD_FLUSH_DATA_SECOND,
66};
67
68#define SKD_ASSERT(expr) \
69 do { \
70 if (unlikely(!(expr))) { \
71 pr_err("Assertion failed! %s,%s,%s,line=%d\n", \
72 # expr, __FILE__, __func__, __LINE__); \
73 } \
74 } while (0)
75
76#define DRV_NAME "skd"
77#define DRV_VERSION "2.2.1"
78#define DRV_BUILD_ID "0260"
79#define PFX DRV_NAME ": "
80#define DRV_BIN_VERSION 0x100
81#define DRV_VER_COMPL "2.2.1." DRV_BUILD_ID
82
83MODULE_AUTHOR("bug-reports: support@stec-inc.com");
84MODULE_LICENSE("Dual BSD/GPL");
85
86MODULE_DESCRIPTION("STEC s1120 PCIe SSD block driver (b" DRV_BUILD_ID ")");
87MODULE_VERSION(DRV_VERSION "-" DRV_BUILD_ID);
88
89#define PCI_VENDOR_ID_STEC 0x1B39
90#define PCI_DEVICE_ID_S1120 0x0001
91
92#define SKD_FUA_NV (1 << 1)
93#define SKD_MINORS_PER_DEVICE 16
94
95#define SKD_MAX_QUEUE_DEPTH 200u
96
97#define SKD_PAUSE_TIMEOUT (5 * 1000)
98
99#define SKD_N_FITMSG_BYTES (512u)
100
101#define SKD_N_SPECIAL_CONTEXT 32u
102#define SKD_N_SPECIAL_FITMSG_BYTES (128u)
103
104/* SG elements are 32 bytes, so we can make this 4096 and still be under the
105 * 128KB limit. That allows 4096*4K = 16M xfer size
106 */
107#define SKD_N_SG_PER_REQ_DEFAULT 256u
108#define SKD_N_SG_PER_SPECIAL 256u
109
110#define SKD_N_COMPLETION_ENTRY 256u
111#define SKD_N_READ_CAP_BYTES (8u)
112
113#define SKD_N_INTERNAL_BYTES (512u)
114
115/* 5 bits of uniquifier, 0xF800 */
116#define SKD_ID_INCR (0x400)
117#define SKD_ID_TABLE_MASK (3u << 8u)
118#define SKD_ID_RW_REQUEST (0u << 8u)
119#define SKD_ID_INTERNAL (1u << 8u)
120#define SKD_ID_SPECIAL_REQUEST (2u << 8u)
121#define SKD_ID_FIT_MSG (3u << 8u)
122#define SKD_ID_SLOT_MASK 0x00FFu
123#define SKD_ID_SLOT_AND_TABLE_MASK 0x03FFu
124
125#define SKD_N_TIMEOUT_SLOT 4u
126#define SKD_TIMEOUT_SLOT_MASK 3u
127
128#define SKD_N_MAX_SECTORS 2048u
129
130#define SKD_MAX_RETRIES 2u
131
132#define SKD_TIMER_SECONDS(seconds) (seconds)
133#define SKD_TIMER_MINUTES(minutes) ((minutes) * (60))
134
135#define INQ_STD_NBYTES 36
136#define SKD_DISCARD_CDB_LENGTH 24
137
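The arithmetic behind the SG sizing comment above SKD_N_SG_PER_REQ_DEFAULT: 4096 descriptors x 32 bytes each = 128 KiB of descriptor list, and 4096 descriptors x 4 KiB pages = 16 MiB maximum transfer. A standalone C11 restatement of the same figures; the SKETCH_* names are invented for illustration:

/* Illustrative only -- restates the SG sizing math from the comment above
 * SKD_N_SG_PER_REQ_DEFAULT.  The SKETCH_* names are not part of the driver.
 */
#define SKETCH_SG_DESC_BYTES 32u
#define SKETCH_MAX_SG        4096u
#define SKETCH_PAGE_BYTES    4096u

_Static_assert(SKETCH_MAX_SG * SKETCH_SG_DESC_BYTES == 128u * 1024u,
               "4096 SG elements * 32 B = 128 KiB descriptor list");
_Static_assert(SKETCH_MAX_SG * SKETCH_PAGE_BYTES == 16u * 1024u * 1024u,
               "4096 SG elements * 4 KiB pages = 16 MiB transfer");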
138enum skd_drvr_state {
139 SKD_DRVR_STATE_LOAD,
140 SKD_DRVR_STATE_IDLE,
141 SKD_DRVR_STATE_BUSY,
142 SKD_DRVR_STATE_STARTING,
143 SKD_DRVR_STATE_ONLINE,
144 SKD_DRVR_STATE_PAUSING,
145 SKD_DRVR_STATE_PAUSED,
146 SKD_DRVR_STATE_DRAINING_TIMEOUT,
147 SKD_DRVR_STATE_RESTARTING,
148 SKD_DRVR_STATE_RESUMING,
149 SKD_DRVR_STATE_STOPPING,
150 SKD_DRVR_STATE_FAULT,
151 SKD_DRVR_STATE_DISAPPEARED,
152 SKD_DRVR_STATE_PROTOCOL_MISMATCH,
153 SKD_DRVR_STATE_BUSY_ERASE,
154 SKD_DRVR_STATE_BUSY_SANITIZE,
155 SKD_DRVR_STATE_BUSY_IMMINENT,
156 SKD_DRVR_STATE_WAIT_BOOT,
157 SKD_DRVR_STATE_SYNCING,
158};
159
160#define SKD_WAIT_BOOT_TIMO SKD_TIMER_SECONDS(90u)
161#define SKD_STARTING_TIMO SKD_TIMER_SECONDS(8u)
162#define SKD_RESTARTING_TIMO SKD_TIMER_MINUTES(4u)
163#define SKD_DRAINING_TIMO SKD_TIMER_SECONDS(6u)
164#define SKD_BUSY_TIMO SKD_TIMER_MINUTES(20u)
165#define SKD_STARTED_BUSY_TIMO SKD_TIMER_SECONDS(60u)
166#define SKD_START_WAIT_SECONDS 90u
167
168enum skd_req_state {
169 SKD_REQ_STATE_IDLE,
170 SKD_REQ_STATE_SETUP,
171 SKD_REQ_STATE_BUSY,
172 SKD_REQ_STATE_COMPLETED,
173 SKD_REQ_STATE_TIMEOUT,
174 SKD_REQ_STATE_ABORTED,
175};
176
177enum skd_fit_msg_state {
178 SKD_MSG_STATE_IDLE,
179 SKD_MSG_STATE_BUSY,
180};
181
182enum skd_check_status_action {
183 SKD_CHECK_STATUS_REPORT_GOOD,
184 SKD_CHECK_STATUS_REPORT_SMART_ALERT,
185 SKD_CHECK_STATUS_REQUEUE_REQUEST,
186 SKD_CHECK_STATUS_REPORT_ERROR,
187 SKD_CHECK_STATUS_BUSY_IMMINENT,
188};
189
190struct skd_fitmsg_context {
191 enum skd_fit_msg_state state;
192
193 struct skd_fitmsg_context *next;
194
195 u32 id;
196 u16 outstanding;
197
198 u32 length;
199 u32 offset;
200
201 u8 *msg_buf;
202 dma_addr_t mb_dma_address;
203};
204
205struct skd_request_context {
206 enum skd_req_state state;
207
208 struct skd_request_context *next;
209
210 u16 id;
211 u32 fitmsg_id;
212
213 struct request *req;
214 u8 flush_cmd;
215 u8 discard_page;
216
217 u32 timeout_stamp;
218 u8 sg_data_dir;
219 struct scatterlist *sg;
220 u32 n_sg;
221 u32 sg_byte_count;
222
223 struct fit_sg_descriptor *sksg_list;
224 dma_addr_t sksg_dma_address;
225
226 struct fit_completion_entry_v1 completion;
227
228 struct fit_comp_error_info err_info;
229
230};
231#define SKD_DATA_DIR_HOST_TO_CARD 1
232#define SKD_DATA_DIR_CARD_TO_HOST 2
233#define SKD_DATA_DIR_NONE 3 /* especially for DISCARD requests. */
234
235struct skd_special_context {
236 struct skd_request_context req;
237
238 u8 orphaned;
239
240 void *data_buf;
241 dma_addr_t db_dma_address;
242
243 u8 *msg_buf;
244 dma_addr_t mb_dma_address;
245};
246
247struct skd_sg_io {
248 fmode_t mode;
249 void __user *argp;
250
251 struct sg_io_hdr sg;
252
253 u8 cdb[16];
254
255 u32 dxfer_len;
256 u32 iovcnt;
257 struct sg_iovec *iov;
258 struct sg_iovec no_iov_iov;
259
260 struct skd_special_context *skspcl;
261};
262
263typedef enum skd_irq_type {
264 SKD_IRQ_LEGACY,
265 SKD_IRQ_MSI,
266 SKD_IRQ_MSIX
267} skd_irq_type_t;
268
269#define SKD_MAX_BARS 2
270
271struct skd_device {
272 volatile void __iomem *mem_map[SKD_MAX_BARS];
273 resource_size_t mem_phys[SKD_MAX_BARS];
274 u32 mem_size[SKD_MAX_BARS];
275
276 skd_irq_type_t irq_type;
277 u32 msix_count;
278 struct skd_msix_entry *msix_entries;
279
280 struct pci_dev *pdev;
281 int pcie_error_reporting_is_enabled;
282
283 spinlock_t lock;
284 struct gendisk *disk;
285 struct request_queue *queue;
286 struct device *class_dev;
287 int gendisk_on;
288 int sync_done;
289
290 atomic_t device_count;
291 u32 devno;
292 u32 major;
293 char name[32];
294 char isr_name[30];
295
296 enum skd_drvr_state state;
297 u32 drive_state;
298
299 u32 in_flight;
300 u32 cur_max_queue_depth;
301 u32 queue_low_water_mark;
302 u32 dev_max_queue_depth;
303
304 u32 num_fitmsg_context;
305 u32 num_req_context;
306
307 u32 timeout_slot[SKD_N_TIMEOUT_SLOT];
308 u32 timeout_stamp;
309 struct skd_fitmsg_context *skmsg_free_list;
310 struct skd_fitmsg_context *skmsg_table;
311
312 struct skd_request_context *skreq_free_list;
313 struct skd_request_context *skreq_table;
314
315 struct skd_special_context *skspcl_free_list;
316 struct skd_special_context *skspcl_table;
317
318 struct skd_special_context internal_skspcl;
319 u32 read_cap_blocksize;
320 u32 read_cap_last_lba;
321 int read_cap_is_valid;
322 int inquiry_is_valid;
323 u8 inq_serial_num[13]; /*12 chars plus null term */
324 u8 id_str[80]; /* holds a composite name (pci + sernum) */
325
326 u8 skcomp_cycle;
327 u32 skcomp_ix;
328 struct fit_completion_entry_v1 *skcomp_table;
329 struct fit_comp_error_info *skerr_table;
330 dma_addr_t cq_dma_address;
331
332 wait_queue_head_t waitq;
333
334 struct timer_list timer;
335 u32 timer_countdown;
336 u32 timer_substate;
337
338 int n_special;
339 int sgs_per_request;
340 u32 last_mtd;
341
342 u32 proto_ver;
343
344 int dbg_level;
345 u32 connect_time_stamp;
346 int connect_retries;
347#define SKD_MAX_CONNECT_RETRIES 16
348 u32 drive_jiffies;
349
350 u32 timo_slot;
351
352
353 struct work_struct completion_worker;
354};
355
356#define SKD_WRITEL(DEV, VAL, OFF) skd_reg_write32(DEV, VAL, OFF)
357#define SKD_READL(DEV, OFF) skd_reg_read32(DEV, OFF)
358#define SKD_WRITEQ(DEV, VAL, OFF) skd_reg_write64(DEV, VAL, OFF)
359
360static inline u32 skd_reg_read32(struct skd_device *skdev, u32 offset)
361{
362 u32 val;
363
364 if (likely(skdev->dbg_level < 2))
365 return readl(skdev->mem_map[1] + offset);
366 else {
367 barrier();
368 val = readl(skdev->mem_map[1] + offset);
369 barrier();
370 pr_debug("%s:%s:%d offset %x = %x\n",
371 skdev->name, __func__, __LINE__, offset, val);
372 return val;
373 }
374
375}
376
377static inline void skd_reg_write32(struct skd_device *skdev, u32 val,
378 u32 offset)
379{
380 if (likely(skdev->dbg_level < 2)) {
381 writel(val, skdev->mem_map[1] + offset);
382 barrier();
383 } else {
384 barrier();
385 writel(val, skdev->mem_map[1] + offset);
386 barrier();
387 pr_debug("%s:%s:%d offset %x = %x\n",
388 skdev->name, __func__, __LINE__, offset, val);
389 }
390}
391
392static inline void skd_reg_write64(struct skd_device *skdev, u64 val,
393 u32 offset)
394{
395 if (likely(skdev->dbg_level < 2)) {
396 writeq(val, skdev->mem_map[1] + offset);
397 barrier();
398 } else {
399 barrier();
400 writeq(val, skdev->mem_map[1] + offset);
401 barrier();
402 pr_debug("%s:%s:%d offset %x = %016llx\n",
403 skdev->name, __func__, __LINE__, offset, val);
404 }
405}
406
407
408#define SKD_IRQ_DEFAULT SKD_IRQ_MSI
409static int skd_isr_type = SKD_IRQ_DEFAULT;
410
411module_param(skd_isr_type, int, 0444);
412MODULE_PARM_DESC(skd_isr_type, "Interrupt type capability."
413 " (0==legacy, 1==MSI, 2==MSI-X, default==1)");
414
415#define SKD_MAX_REQ_PER_MSG_DEFAULT 1
416static int skd_max_req_per_msg = SKD_MAX_REQ_PER_MSG_DEFAULT;
417
418module_param(skd_max_req_per_msg, int, 0444);
419MODULE_PARM_DESC(skd_max_req_per_msg,
420 "Maximum SCSI requests packed in a single message."
421 " (1-14, default==1)");
422
423#define SKD_MAX_QUEUE_DEPTH_DEFAULT 64
424#define SKD_MAX_QUEUE_DEPTH_DEFAULT_STR "64"
425static int skd_max_queue_depth = SKD_MAX_QUEUE_DEPTH_DEFAULT;
426
427module_param(skd_max_queue_depth, int, 0444);
428MODULE_PARM_DESC(skd_max_queue_depth,
429 "Maximum SCSI requests issued to s1120."
430 " (1-200, default==" SKD_MAX_QUEUE_DEPTH_DEFAULT_STR ")");
431
432static int skd_sgs_per_request = SKD_N_SG_PER_REQ_DEFAULT;
433module_param(skd_sgs_per_request, int, 0444);
434MODULE_PARM_DESC(skd_sgs_per_request,
435 "Maximum SG elements per block request."
436 " (1-4096, default==256)");
437
438static int skd_max_pass_thru = SKD_N_SPECIAL_CONTEXT;
439module_param(skd_max_pass_thru, int, 0444);
440MODULE_PARM_DESC(skd_max_pass_thru,
441 "Maximum SCSI pass-thru at a time." " (1-50, default==32)");
442
443module_param(skd_dbg_level, int, 0444);
444MODULE_PARM_DESC(skd_dbg_level, "s1120 debug level (0,1,2)");
445
446module_param(skd_isr_comp_limit, int, 0444);
447MODULE_PARM_DESC(skd_isr_comp_limit, "s1120 isr comp limit (0=none) default=4");
448
449/* Major device number dynamically assigned. */
450static u32 skd_major;
451
452static void skd_destruct(struct skd_device *skdev);
453static const struct block_device_operations skd_blockdev_ops;
454static void skd_send_fitmsg(struct skd_device *skdev,
455 struct skd_fitmsg_context *skmsg);
456static void skd_send_special_fitmsg(struct skd_device *skdev,
457 struct skd_special_context *skspcl);
458static void skd_request_fn(struct request_queue *rq);
459static void skd_end_request(struct skd_device *skdev,
460 struct skd_request_context *skreq, int error);
461static int skd_preop_sg_list(struct skd_device *skdev,
462 struct skd_request_context *skreq);
463static void skd_postop_sg_list(struct skd_device *skdev,
464 struct skd_request_context *skreq);
465
466static void skd_restart_device(struct skd_device *skdev);
467static int skd_quiesce_dev(struct skd_device *skdev);
468static int skd_unquiesce_dev(struct skd_device *skdev);
469static void skd_release_special(struct skd_device *skdev,
470 struct skd_special_context *skspcl);
471static void skd_disable_interrupts(struct skd_device *skdev);
472static void skd_isr_fwstate(struct skd_device *skdev);
473static void skd_recover_requests(struct skd_device *skdev, int requeue);
474static void skd_soft_reset(struct skd_device *skdev);
475
476static const char *skd_name(struct skd_device *skdev);
477const char *skd_drive_state_to_str(int state);
478const char *skd_skdev_state_to_str(enum skd_drvr_state state);
479static void skd_log_skdev(struct skd_device *skdev, const char *event);
480static void skd_log_skmsg(struct skd_device *skdev,
481 struct skd_fitmsg_context *skmsg, const char *event);
482static void skd_log_skreq(struct skd_device *skdev,
483 struct skd_request_context *skreq, const char *event);
484
485/*
486 *****************************************************************************
487 * READ/WRITE REQUESTS
488 *****************************************************************************
489 */
490static void skd_fail_all_pending(struct skd_device *skdev)
491{
492 struct request_queue *q = skdev->queue;
493 struct request *req;
494
495 for (;; ) {
496 req = blk_peek_request(q);
497 if (req == NULL)
498 break;
499 blk_start_request(req);
500 __blk_end_request_all(req, -EIO);
501 }
502}
503
504static void
505skd_prep_rw_cdb(struct skd_scsi_request *scsi_req,
506 int data_dir, unsigned lba,
507 unsigned count)
508{
509 if (data_dir == READ)
510 scsi_req->cdb[0] = 0x28;
511 else
512 scsi_req->cdb[0] = 0x2a;
513
514 scsi_req->cdb[1] = 0;
515 scsi_req->cdb[2] = (lba & 0xff000000) >> 24;
516 scsi_req->cdb[3] = (lba & 0xff0000) >> 16;
517 scsi_req->cdb[4] = (lba & 0xff00) >> 8;
518 scsi_req->cdb[5] = (lba & 0xff);
519 scsi_req->cdb[6] = 0;
520 scsi_req->cdb[7] = (count & 0xff00) >> 8;
521 scsi_req->cdb[8] = count & 0xff;
522 scsi_req->cdb[9] = 0;
523}
524
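skd_prep_rw_cdb() above packs the LBA and sector count into the big-endian fields of a 10-byte READ(10)/WRITE(10)-style CDB. A small worked check with hypothetical values (lba = 0x12345678, count = 8; chosen for illustration, not taken from the driver):

/* Hypothetical check of skd_prep_rw_cdb()'s field packing for
 * lba = 0x12345678 and count = 8.  Standalone, compile with any C compiler.
 */
#include <assert.h>

int main(void)
{
	unsigned int lba = 0x12345678u, count = 8u;

	assert(((lba & 0xff000000u) >> 24) == 0x12u);	/* cdb[2] */
	assert(((lba & 0x00ff0000u) >> 16) == 0x34u);	/* cdb[3] */
	assert(((lba & 0x0000ff00u) >>  8) == 0x56u);	/* cdb[4] */
	assert((lba & 0xffu)               == 0x78u);	/* cdb[5] */
	assert(((count & 0xff00u) >> 8)    == 0x00u);	/* cdb[7] */
	assert((count & 0xffu)             == 0x08u);	/* cdb[8] */
	return 0;
}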
525static void
526skd_prep_zerosize_flush_cdb(struct skd_scsi_request *scsi_req,
527 struct skd_request_context *skreq)
528{
529 skreq->flush_cmd = 1;
530
531 scsi_req->cdb[0] = 0x35;
532 scsi_req->cdb[1] = 0;
533 scsi_req->cdb[2] = 0;
534 scsi_req->cdb[3] = 0;
535 scsi_req->cdb[4] = 0;
536 scsi_req->cdb[5] = 0;
537 scsi_req->cdb[6] = 0;
538 scsi_req->cdb[7] = 0;
539 scsi_req->cdb[8] = 0;
540 scsi_req->cdb[9] = 0;
541}
542
543static void
544skd_prep_discard_cdb(struct skd_scsi_request *scsi_req,
545 struct skd_request_context *skreq,
546 struct page *page,
547 u32 lba, u32 count)
548{
549 char *buf;
550 unsigned long len;
551 struct request *req;
552
553 buf = page_address(page);
554 len = SKD_DISCARD_CDB_LENGTH;
555
556 scsi_req->cdb[0] = UNMAP;
557 scsi_req->cdb[8] = len;
558
559 put_unaligned_be16(6 + 16, &buf[0]);
560 put_unaligned_be16(16, &buf[2]);
561 put_unaligned_be64(lba, &buf[8]);
562 put_unaligned_be32(count, &buf[16]);
563
564 req = skreq->req;
565 blk_add_request_payload(req, page, len);
566 req->buffer = buf;
567}
568
569static void skd_request_fn_not_online(struct request_queue *q);
570
571static void skd_request_fn(struct request_queue *q)
572{
573 struct skd_device *skdev = q->queuedata;
574 struct skd_fitmsg_context *skmsg = NULL;
575 struct fit_msg_hdr *fmh = NULL;
576 struct skd_request_context *skreq;
577 struct request *req = NULL;
578 struct skd_scsi_request *scsi_req;
579 struct page *page;
580 unsigned long io_flags;
581 int error;
582 u32 lba;
583 u32 count;
584 int data_dir;
585 u32 be_lba;
586 u32 be_count;
587 u64 be_dmaa;
588 u64 cmdctxt;
589 u32 timo_slot;
590 void *cmd_ptr;
591 int flush, fua;
592
593 if (skdev->state != SKD_DRVR_STATE_ONLINE) {
594 skd_request_fn_not_online(q);
595 return;
596 }
597
598 if (blk_queue_stopped(skdev->queue)) {
599 if (skdev->skmsg_free_list == NULL ||
600 skdev->skreq_free_list == NULL ||
601 skdev->in_flight >= skdev->queue_low_water_mark)
602 /* There is still some kind of shortage */
603 return;
604
605 queue_flag_clear(QUEUE_FLAG_STOPPED, skdev->queue);
606 }
607
608 /*
609 * Stop conditions:
610 * - There are no more native requests
611 * - There are already the maximum number of requests in progress
612 * - There are no more skd_request_context entries
613 * - There are no more FIT msg buffers
614 */
615 for (;; ) {
616
617 flush = fua = 0;
618
619 req = blk_peek_request(q);
620
621 /* Are there any native requests to start? */
622 if (req == NULL)
623 break;
624
625 lba = (u32)blk_rq_pos(req);
626 count = blk_rq_sectors(req);
627 data_dir = rq_data_dir(req);
628 io_flags = req->cmd_flags;
629
630 if (io_flags & REQ_FLUSH)
631 flush++;
632
633 if (io_flags & REQ_FUA)
634 fua++;
635
636 pr_debug("%s:%s:%d new req=%p lba=%u(0x%x) "
637 "count=%u(0x%x) dir=%d\n",
638 skdev->name, __func__, __LINE__,
639 req, lba, lba, count, count, data_dir);
640
641 /* At this point we know there is a request */
642
 643		/* Are too many requests already in progress? */
644 if (skdev->in_flight >= skdev->cur_max_queue_depth) {
645 pr_debug("%s:%s:%d qdepth %d, limit %d\n",
646 skdev->name, __func__, __LINE__,
647 skdev->in_flight, skdev->cur_max_queue_depth);
648 break;
649 }
650
651 /* Is a skd_request_context available? */
652 skreq = skdev->skreq_free_list;
653 if (skreq == NULL) {
654 pr_debug("%s:%s:%d Out of req=%p\n",
655 skdev->name, __func__, __LINE__, q);
656 break;
657 }
658 SKD_ASSERT(skreq->state == SKD_REQ_STATE_IDLE);
659 SKD_ASSERT((skreq->id & SKD_ID_INCR) == 0);
660
661 /* Now we check to see if we can get a fit msg */
662 if (skmsg == NULL) {
663 if (skdev->skmsg_free_list == NULL) {
664 pr_debug("%s:%s:%d Out of msg\n",
665 skdev->name, __func__, __LINE__);
666 break;
667 }
668 }
669
670 skreq->flush_cmd = 0;
671 skreq->n_sg = 0;
672 skreq->sg_byte_count = 0;
673 skreq->discard_page = 0;
674
675 /*
676 * OK to now dequeue request from q.
677 *
 678		 * At this point we are committed to either start or reject
679 * the native request. Note that skd_request_context is
680 * available but is still at the head of the free list.
681 */
682 blk_start_request(req);
683 skreq->req = req;
684 skreq->fitmsg_id = 0;
685
686 /* Either a FIT msg is in progress or we have to start one. */
687 if (skmsg == NULL) {
688 /* Are there any FIT msg buffers available? */
689 skmsg = skdev->skmsg_free_list;
690 if (skmsg == NULL) {
691 pr_debug("%s:%s:%d Out of msg skdev=%p\n",
692 skdev->name, __func__, __LINE__,
693 skdev);
694 break;
695 }
696 SKD_ASSERT(skmsg->state == SKD_MSG_STATE_IDLE);
697 SKD_ASSERT((skmsg->id & SKD_ID_INCR) == 0);
698
699 skdev->skmsg_free_list = skmsg->next;
700
701 skmsg->state = SKD_MSG_STATE_BUSY;
702 skmsg->id += SKD_ID_INCR;
703
704 /* Initialize the FIT msg header */
705 fmh = (struct fit_msg_hdr *)skmsg->msg_buf;
706 memset(fmh, 0, sizeof(*fmh));
707 fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT;
708 skmsg->length = sizeof(*fmh);
709 }
710
711 skreq->fitmsg_id = skmsg->id;
712
713 /*
714 * Note that a FIT msg may have just been started
715 * but contains no SoFIT requests yet.
716 */
717
718 /*
719 * Transcode the request, checking as we go. The outcome of
720 * the transcoding is represented by the error variable.
721 */
722 cmd_ptr = &skmsg->msg_buf[skmsg->length];
723 memset(cmd_ptr, 0, 32);
724
725 be_lba = cpu_to_be32(lba);
726 be_count = cpu_to_be32(count);
727 be_dmaa = cpu_to_be64((u64)skreq->sksg_dma_address);
728 cmdctxt = skreq->id + SKD_ID_INCR;
729
730 scsi_req = cmd_ptr;
731 scsi_req->hdr.tag = cmdctxt;
732 scsi_req->hdr.sg_list_dma_address = be_dmaa;
733
734 if (data_dir == READ)
735 skreq->sg_data_dir = SKD_DATA_DIR_CARD_TO_HOST;
736 else
737 skreq->sg_data_dir = SKD_DATA_DIR_HOST_TO_CARD;
738
739 if (io_flags & REQ_DISCARD) {
740 page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
741 if (!page) {
742 pr_err("request_fn:Page allocation failed.\n");
743 skd_end_request(skdev, skreq, -ENOMEM);
744 break;
745 }
746 skreq->discard_page = 1;
747 skd_prep_discard_cdb(scsi_req, skreq, page, lba, count);
748
749 } else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) {
750 skd_prep_zerosize_flush_cdb(scsi_req, skreq);
751 SKD_ASSERT(skreq->flush_cmd == 1);
752
753 } else {
754 skd_prep_rw_cdb(scsi_req, data_dir, lba, count);
755 }
756
757 if (fua)
758 scsi_req->cdb[1] |= SKD_FUA_NV;
759
760 if (!req->bio)
761 goto skip_sg;
762
763 error = skd_preop_sg_list(skdev, skreq);
764
765 if (error != 0) {
766 /*
767 * Complete the native request with error.
768 * Note that the request context is still at the
769 * head of the free list, and that the SoFIT request
770 * was encoded into the FIT msg buffer but the FIT
771 * msg length has not been updated. In short, the
772 * only resource that has been allocated but might
773 * not be used is that the FIT msg could be empty.
774 */
775 pr_debug("%s:%s:%d error Out\n",
776 skdev->name, __func__, __LINE__);
777 skd_end_request(skdev, skreq, error);
778 continue;
779 }
780
781skip_sg:
782 scsi_req->hdr.sg_list_len_bytes =
783 cpu_to_be32(skreq->sg_byte_count);
784
785 /* Complete resource allocations. */
786 skdev->skreq_free_list = skreq->next;
787 skreq->state = SKD_REQ_STATE_BUSY;
788 skreq->id += SKD_ID_INCR;
789
790 skmsg->length += sizeof(struct skd_scsi_request);
791 fmh->num_protocol_cmds_coalesced++;
792
793 /*
794 * Update the active request counts.
795 * Capture the timeout timestamp.
796 */
797 skreq->timeout_stamp = skdev->timeout_stamp;
798 timo_slot = skreq->timeout_stamp & SKD_TIMEOUT_SLOT_MASK;
799 skdev->timeout_slot[timo_slot]++;
800 skdev->in_flight++;
801 pr_debug("%s:%s:%d req=0x%x busy=%d\n",
802 skdev->name, __func__, __LINE__,
803 skreq->id, skdev->in_flight);
804
805 /*
806 * If the FIT msg buffer is full send it.
807 */
808 if (skmsg->length >= SKD_N_FITMSG_BYTES ||
809 fmh->num_protocol_cmds_coalesced >= skd_max_req_per_msg) {
810 skd_send_fitmsg(skdev, skmsg);
811 skmsg = NULL;
812 fmh = NULL;
813 }
814 }
815
816 /*
817 * Is a FIT msg in progress? If it is empty put the buffer back
818 * on the free list. If it is non-empty send what we got.
819 * This minimizes latency when there are fewer requests than
820 * what fits in a FIT msg.
821 */
822 if (skmsg != NULL) {
823 /* Bigger than just a FIT msg header? */
824 if (skmsg->length > sizeof(struct fit_msg_hdr)) {
825 pr_debug("%s:%s:%d sending msg=%p, len %d\n",
826 skdev->name, __func__, __LINE__,
827 skmsg, skmsg->length);
828 skd_send_fitmsg(skdev, skmsg);
829 } else {
830 /*
831 * The FIT msg is empty. It means we got started
832 * on the msg, but the requests were rejected.
833 */
834 skmsg->state = SKD_MSG_STATE_IDLE;
835 skmsg->id += SKD_ID_INCR;
836 skmsg->next = skdev->skmsg_free_list;
837 skdev->skmsg_free_list = skmsg;
838 }
839 skmsg = NULL;
840 fmh = NULL;
841 }
842
843 /*
844 * If req is non-NULL it means there is something to do but
845 * we are out of a resource.
846 */
847 if (req)
848 blk_stop_queue(skdev->queue);
849}
850
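The coalescing cutoff near the end of the loop (skmsg->length >= SKD_N_FITMSG_BYTES or num_protocol_cmds_coalesced >= skd_max_req_per_msg) is consistent with the 1-14 range advertised for skd_max_req_per_msg: assuming a 64-byte struct fit_msg_hdr and a 32-byte struct skd_scsi_request (the loop memsets 32 bytes per command), 64 + 14 * 32 = 512 = SKD_N_FITMSG_BYTES, so a full FIT message holds at most 14 coalesced requests. Those struct sizes are inferred here, not stated in this file; a C11 sketch under that assumption:

/* Rough sketch of the FIT-message packing bound, assuming a 64-byte header
 * and 32-byte per-command record (sizes inferred, not taken from skd_s1120.h).
 */
#define SKETCH_FITMSG_BYTES 512u
#define SKETCH_HDR_BYTES     64u
#define SKETCH_CMD_BYTES     32u

_Static_assert((SKETCH_FITMSG_BYTES - SKETCH_HDR_BYTES) / SKETCH_CMD_BYTES == 14u,
               "(512 - 64) / 32 = 14 requests per FIT message");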
851static void skd_end_request(struct skd_device *skdev,
852 struct skd_request_context *skreq, int error)
853{
854 struct request *req = skreq->req;
855 unsigned int io_flags = req->cmd_flags;
856
857 if ((io_flags & REQ_DISCARD) &&
858 (skreq->discard_page == 1)) {
859 pr_debug("%s:%s:%d, free the page!",
860 skdev->name, __func__, __LINE__);
861 free_page((unsigned long)req->buffer);
862 req->buffer = NULL;
863 }
864
865 if (unlikely(error)) {
866 struct request *req = skreq->req;
867 char *cmd = (rq_data_dir(req) == READ) ? "read" : "write";
868 u32 lba = (u32)blk_rq_pos(req);
869 u32 count = blk_rq_sectors(req);
870
871 pr_err("(%s): Error cmd=%s sect=%u count=%u id=0x%x\n",
872 skd_name(skdev), cmd, lba, count, skreq->id);
873 } else
874 pr_debug("%s:%s:%d id=0x%x error=%d\n",
875 skdev->name, __func__, __LINE__, skreq->id, error);
876
877 __blk_end_request_all(skreq->req, error);
878}
879
880static int skd_preop_sg_list(struct skd_device *skdev,
881 struct skd_request_context *skreq)
882{
883 struct request *req = skreq->req;
884 int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD;
885 int pci_dir = writing ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE;
886 struct scatterlist *sg = &skreq->sg[0];
887 int n_sg;
888 int i;
889
890 skreq->sg_byte_count = 0;
891
892 /* SKD_ASSERT(skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD ||
893 skreq->sg_data_dir == SKD_DATA_DIR_CARD_TO_HOST); */
894
895 n_sg = blk_rq_map_sg(skdev->queue, req, sg);
896 if (n_sg <= 0)
897 return -EINVAL;
898
899 /*
900 * Map scatterlist to PCI bus addresses.
901 * Note PCI might change the number of entries.
902 */
903 n_sg = pci_map_sg(skdev->pdev, sg, n_sg, pci_dir);
904 if (n_sg <= 0)
905 return -EINVAL;
906
907 SKD_ASSERT(n_sg <= skdev->sgs_per_request);
908
909 skreq->n_sg = n_sg;
910
911 for (i = 0; i < n_sg; i++) {
912 struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
913 u32 cnt = sg_dma_len(&sg[i]);
914 uint64_t dma_addr = sg_dma_address(&sg[i]);
915
916 sgd->control = FIT_SGD_CONTROL_NOT_LAST;
917 sgd->byte_count = cnt;
918 skreq->sg_byte_count += cnt;
919 sgd->host_side_addr = dma_addr;
920 sgd->dev_side_addr = 0;
921 }
922
923 skreq->sksg_list[n_sg - 1].next_desc_ptr = 0LL;
924 skreq->sksg_list[n_sg - 1].control = FIT_SGD_CONTROL_LAST;
925
926 if (unlikely(skdev->dbg_level > 1)) {
927 pr_debug("%s:%s:%d skreq=%x sksg_list=%p sksg_dma=%llx\n",
928 skdev->name, __func__, __LINE__,
929 skreq->id, skreq->sksg_list, skreq->sksg_dma_address);
930 for (i = 0; i < n_sg; i++) {
931 struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
932 pr_debug("%s:%s:%d sg[%d] count=%u ctrl=0x%x "
933 "addr=0x%llx next=0x%llx\n",
934 skdev->name, __func__, __LINE__,
935 i, sgd->byte_count, sgd->control,
936 sgd->host_side_addr, sgd->next_desc_ptr);
937 }
938 }
939
940 return 0;
941}
942
943static void skd_postop_sg_list(struct skd_device *skdev,
944 struct skd_request_context *skreq)
945{
946 int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD;
947 int pci_dir = writing ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE;
948
949 /*
950 * restore the next ptr for next IO request so we
951 * don't have to set it every time.
952 */
953 skreq->sksg_list[skreq->n_sg - 1].next_desc_ptr =
954 skreq->sksg_dma_address +
955 ((skreq->n_sg) * sizeof(struct fit_sg_descriptor));
956 pci_unmap_sg(skdev->pdev, &skreq->sg[0], skreq->n_sg, pci_dir);
957}
958
959static void skd_request_fn_not_online(struct request_queue *q)
960{
961 struct skd_device *skdev = q->queuedata;
962 int error;
963
964 SKD_ASSERT(skdev->state != SKD_DRVR_STATE_ONLINE);
965
966 skd_log_skdev(skdev, "req_not_online");
967 switch (skdev->state) {
968 case SKD_DRVR_STATE_PAUSING:
969 case SKD_DRVR_STATE_PAUSED:
970 case SKD_DRVR_STATE_STARTING:
971 case SKD_DRVR_STATE_RESTARTING:
972 case SKD_DRVR_STATE_WAIT_BOOT:
973 /* In case of starting, we haven't started the queue,
974 * so we can't get here... but requests are
975 * possibly hanging out waiting for us because we
976 * reported the dev/skd0 already. They'll wait
977 * forever if connect doesn't complete.
978 * What to do??? delay dev/skd0 ??
979 */
980 case SKD_DRVR_STATE_BUSY:
981 case SKD_DRVR_STATE_BUSY_IMMINENT:
982 case SKD_DRVR_STATE_BUSY_ERASE:
983 case SKD_DRVR_STATE_DRAINING_TIMEOUT:
984 return;
985
986 case SKD_DRVR_STATE_BUSY_SANITIZE:
987 case SKD_DRVR_STATE_STOPPING:
988 case SKD_DRVR_STATE_SYNCING:
989 case SKD_DRVR_STATE_FAULT:
990 case SKD_DRVR_STATE_DISAPPEARED:
991 default:
992 error = -EIO;
993 break;
994 }
995
 996	/* If we get here, terminate all pending block requests
997 * with EIO and any scsi pass thru with appropriate sense
998 */
999
1000 skd_fail_all_pending(skdev);
1001}
1002
1003/*
1004 *****************************************************************************
1005 * TIMER
1006 *****************************************************************************
1007 */
1008
1009static void skd_timer_tick_not_online(struct skd_device *skdev);
1010
1011static void skd_timer_tick(ulong arg)
1012{
1013 struct skd_device *skdev = (struct skd_device *)arg;
1014
1015 u32 timo_slot;
1016 u32 overdue_timestamp;
1017 unsigned long reqflags;
1018 u32 state;
1019
1020 if (skdev->state == SKD_DRVR_STATE_FAULT)
1021 /* The driver has declared fault, and we want it to
1022 * stay that way until driver is reloaded.
1023 */
1024 return;
1025
1026 spin_lock_irqsave(&skdev->lock, reqflags);
1027
1028 state = SKD_READL(skdev, FIT_STATUS);
1029 state &= FIT_SR_DRIVE_STATE_MASK;
1030 if (state != skdev->drive_state)
1031 skd_isr_fwstate(skdev);
1032
1033 if (skdev->state != SKD_DRVR_STATE_ONLINE) {
1034 skd_timer_tick_not_online(skdev);
1035 goto timer_func_out;
1036 }
1037 skdev->timeout_stamp++;
1038 timo_slot = skdev->timeout_stamp & SKD_TIMEOUT_SLOT_MASK;
1039
1040 /*
1041 * All requests that happened during the previous use of
1042 * this slot should be done by now. The previous use was
1043 * over 7 seconds ago.
1044 */
1045 if (skdev->timeout_slot[timo_slot] == 0)
1046 goto timer_func_out;
1047
1048 /* Something is overdue */
1049 overdue_timestamp = skdev->timeout_stamp - SKD_N_TIMEOUT_SLOT;
1050
1051 pr_debug("%s:%s:%d found %d timeouts, draining busy=%d\n",
1052 skdev->name, __func__, __LINE__,
1053 skdev->timeout_slot[timo_slot], skdev->in_flight);
1054 pr_err("(%s): Overdue IOs (%d), busy %d\n",
1055 skd_name(skdev), skdev->timeout_slot[timo_slot],
1056 skdev->in_flight);
1057
1058 skdev->timer_countdown = SKD_DRAINING_TIMO;
1059 skdev->state = SKD_DRVR_STATE_DRAINING_TIMEOUT;
1060 skdev->timo_slot = timo_slot;
1061 blk_stop_queue(skdev->queue);
1062
1063timer_func_out:
1064 mod_timer(&skdev->timer, (jiffies + HZ));
1065
1066 spin_unlock_irqrestore(&skdev->lock, reqflags);
1067}
1068
1069static void skd_timer_tick_not_online(struct skd_device *skdev)
1070{
1071 switch (skdev->state) {
1072 case SKD_DRVR_STATE_IDLE:
1073 case SKD_DRVR_STATE_LOAD:
1074 break;
1075 case SKD_DRVR_STATE_BUSY_SANITIZE:
1076 pr_debug("%s:%s:%d drive busy sanitize[%x], driver[%x]\n",
1077 skdev->name, __func__, __LINE__,
1078 skdev->drive_state, skdev->state);
1079 /* If we've been in sanitize for 3 seconds, we figure we're not
 1080		 * going to get any more completions, so recover requests now
1081 */
1082 if (skdev->timer_countdown > 0) {
1083 skdev->timer_countdown--;
1084 return;
1085 }
1086 skd_recover_requests(skdev, 0);
1087 break;
1088
1089 case SKD_DRVR_STATE_BUSY:
1090 case SKD_DRVR_STATE_BUSY_IMMINENT:
1091 case SKD_DRVR_STATE_BUSY_ERASE:
1092 pr_debug("%s:%s:%d busy[%x], countdown=%d\n",
1093 skdev->name, __func__, __LINE__,
1094 skdev->state, skdev->timer_countdown);
1095 if (skdev->timer_countdown > 0) {
1096 skdev->timer_countdown--;
1097 return;
1098 }
1099 pr_debug("%s:%s:%d busy[%x], timedout=%d, restarting device.",
1100 skdev->name, __func__, __LINE__,
1101 skdev->state, skdev->timer_countdown);
1102 skd_restart_device(skdev);
1103 break;
1104
1105 case SKD_DRVR_STATE_WAIT_BOOT:
1106 case SKD_DRVR_STATE_STARTING:
1107 if (skdev->timer_countdown > 0) {
1108 skdev->timer_countdown--;
1109 return;
1110 }
1111 /* For now, we fault the drive. Could attempt resets to
 1112		 * recover at some point. */
1113 skdev->state = SKD_DRVR_STATE_FAULT;
1114
1115 pr_err("(%s): DriveFault Connect Timeout (%x)\n",
1116 skd_name(skdev), skdev->drive_state);
1117
1118 /*start the queue so we can respond with error to requests */
1119 /* wakeup anyone waiting for startup complete */
1120 blk_start_queue(skdev->queue);
1121 skdev->gendisk_on = -1;
1122 wake_up_interruptible(&skdev->waitq);
1123 break;
1124
1125 case SKD_DRVR_STATE_ONLINE:
1126 /* shouldn't get here. */
1127 break;
1128
1129 case SKD_DRVR_STATE_PAUSING:
1130 case SKD_DRVR_STATE_PAUSED:
1131 break;
1132
1133 case SKD_DRVR_STATE_DRAINING_TIMEOUT:
1134 pr_debug("%s:%s:%d "
1135 "draining busy [%d] tick[%d] qdb[%d] tmls[%d]\n",
1136 skdev->name, __func__, __LINE__,
1137 skdev->timo_slot,
1138 skdev->timer_countdown,
1139 skdev->in_flight,
1140 skdev->timeout_slot[skdev->timo_slot]);
1141 /* if the slot has cleared we can let the I/O continue */
1142 if (skdev->timeout_slot[skdev->timo_slot] == 0) {
1143 pr_debug("%s:%s:%d Slot drained, starting queue.\n",
1144 skdev->name, __func__, __LINE__);
1145 skdev->state = SKD_DRVR_STATE_ONLINE;
1146 blk_start_queue(skdev->queue);
1147 return;
1148 }
1149 if (skdev->timer_countdown > 0) {
1150 skdev->timer_countdown--;
1151 return;
1152 }
1153 skd_restart_device(skdev);
1154 break;
1155
1156 case SKD_DRVR_STATE_RESTARTING:
1157 if (skdev->timer_countdown > 0) {
1158 skdev->timer_countdown--;
1159 return;
1160 }
1161 /* For now, we fault the drive. Could attempt resets to
 1162		 * recover at some point. */
1163 skdev->state = SKD_DRVR_STATE_FAULT;
1164 pr_err("(%s): DriveFault Reconnect Timeout (%x)\n",
1165 skd_name(skdev), skdev->drive_state);
1166
1167 /*
1168 * Recovering does two things:
1169 * 1. completes IO with error
1170 * 2. reclaims dma resources
1171 * When is it safe to recover requests?
1172 * - if the drive state is faulted
 1173		 * - if the state is still soft reset after our timeout
1174 * - if the drive registers are dead (state = FF)
1175 * If it is "unsafe", we still need to recover, so we will
1176 * disable pci bus mastering and disable our interrupts.
1177 */
1178
1179 if ((skdev->drive_state == FIT_SR_DRIVE_SOFT_RESET) ||
1180 (skdev->drive_state == FIT_SR_DRIVE_FAULT) ||
1181 (skdev->drive_state == FIT_SR_DRIVE_STATE_MASK))
1182 /* It never came out of soft reset. Try to
1183 * recover the requests and then let them
1184 * fail. This is to mitigate hung processes. */
1185 skd_recover_requests(skdev, 0);
1186 else {
1187 pr_err("(%s): Disable BusMaster (%x)\n",
1188 skd_name(skdev), skdev->drive_state);
1189 pci_disable_device(skdev->pdev);
1190 skd_disable_interrupts(skdev);
1191 skd_recover_requests(skdev, 0);
1192 }
1193
1194 /*start the queue so we can respond with error to requests */
1195 /* wakeup anyone waiting for startup complete */
1196 blk_start_queue(skdev->queue);
1197 skdev->gendisk_on = -1;
1198 wake_up_interruptible(&skdev->waitq);
1199 break;
1200
1201 case SKD_DRVR_STATE_RESUMING:
1202 case SKD_DRVR_STATE_STOPPING:
1203 case SKD_DRVR_STATE_SYNCING:
1204 case SKD_DRVR_STATE_FAULT:
1205 case SKD_DRVR_STATE_DISAPPEARED:
1206 default:
1207 break;
1208 }
1209}
1210
1211static int skd_start_timer(struct skd_device *skdev)
1212{
1213 int rc;
1214
1215 init_timer(&skdev->timer);
1216 setup_timer(&skdev->timer, skd_timer_tick, (ulong)skdev);
1217
1218 rc = mod_timer(&skdev->timer, (jiffies + HZ));
1219 if (rc)
1220 pr_err("%s: failed to start timer %d\n",
1221 __func__, rc);
1222 return rc;
1223}
1224
1225static void skd_kill_timer(struct skd_device *skdev)
1226{
1227 del_timer_sync(&skdev->timer);
1228}
1229
1230/*
1231 *****************************************************************************
1232 * IOCTL
1233 *****************************************************************************
1234 */
1235static int skd_ioctl_sg_io(struct skd_device *skdev,
1236 fmode_t mode, void __user *argp);
1237static int skd_sg_io_get_and_check_args(struct skd_device *skdev,
1238 struct skd_sg_io *sksgio);
1239static int skd_sg_io_obtain_skspcl(struct skd_device *skdev,
1240 struct skd_sg_io *sksgio);
1241static int skd_sg_io_prep_buffering(struct skd_device *skdev,
1242 struct skd_sg_io *sksgio);
1243static int skd_sg_io_copy_buffer(struct skd_device *skdev,
1244 struct skd_sg_io *sksgio, int dxfer_dir);
1245static int skd_sg_io_send_fitmsg(struct skd_device *skdev,
1246 struct skd_sg_io *sksgio);
1247static int skd_sg_io_await(struct skd_device *skdev, struct skd_sg_io *sksgio);
1248static int skd_sg_io_release_skspcl(struct skd_device *skdev,
1249 struct skd_sg_io *sksgio);
1250static int skd_sg_io_put_status(struct skd_device *skdev,
1251 struct skd_sg_io *sksgio);
1252
1253static void skd_complete_special(struct skd_device *skdev,
1254 volatile struct fit_completion_entry_v1
1255 *skcomp,
1256 volatile struct fit_comp_error_info *skerr,
1257 struct skd_special_context *skspcl);
1258
1259static int skd_bdev_ioctl(struct block_device *bdev, fmode_t mode,
1260 uint cmd_in, ulong arg)
1261{
1262 int rc = 0;
1263 struct gendisk *disk = bdev->bd_disk;
1264 struct skd_device *skdev = disk->private_data;
1265 void __user *p = (void *)arg;
1266
1267 pr_debug("%s:%s:%d %s: CMD[%s] ioctl mode 0x%x, cmd 0x%x arg %0lx\n",
1268 skdev->name, __func__, __LINE__,
1269 disk->disk_name, current->comm, mode, cmd_in, arg);
1270
1271 if (!capable(CAP_SYS_ADMIN))
1272 return -EPERM;
1273
1274 switch (cmd_in) {
1275 case SG_SET_TIMEOUT:
1276 case SG_GET_TIMEOUT:
1277 case SG_GET_VERSION_NUM:
1278 rc = scsi_cmd_ioctl(disk->queue, disk, mode, cmd_in, p);
1279 break;
1280 case SG_IO:
1281 rc = skd_ioctl_sg_io(skdev, mode, p);
1282 break;
1283
1284 default:
1285 rc = -ENOTTY;
1286 break;
1287 }
1288
1289 pr_debug("%s:%s:%d %s: completion rc %d\n",
1290 skdev->name, __func__, __LINE__, disk->disk_name, rc);
1291 return rc;
1292}
1293
1294static int skd_ioctl_sg_io(struct skd_device *skdev, fmode_t mode,
1295 void __user *argp)
1296{
1297 int rc;
1298 struct skd_sg_io sksgio;
1299
1300 memset(&sksgio, 0, sizeof(sksgio));
1301 sksgio.mode = mode;
1302 sksgio.argp = argp;
1303 sksgio.iov = &sksgio.no_iov_iov;
1304
1305 switch (skdev->state) {
1306 case SKD_DRVR_STATE_ONLINE:
1307 case SKD_DRVR_STATE_BUSY_IMMINENT:
1308 break;
1309
1310 default:
1311 pr_debug("%s:%s:%d drive not online\n",
1312 skdev->name, __func__, __LINE__);
1313 rc = -ENXIO;
1314 goto out;
1315 }
1316
1317 rc = skd_sg_io_get_and_check_args(skdev, &sksgio);
1318 if (rc)
1319 goto out;
1320
1321 rc = skd_sg_io_obtain_skspcl(skdev, &sksgio);
1322 if (rc)
1323 goto out;
1324
1325 rc = skd_sg_io_prep_buffering(skdev, &sksgio);
1326 if (rc)
1327 goto out;
1328
1329 rc = skd_sg_io_copy_buffer(skdev, &sksgio, SG_DXFER_TO_DEV);
1330 if (rc)
1331 goto out;
1332
1333 rc = skd_sg_io_send_fitmsg(skdev, &sksgio);
1334 if (rc)
1335 goto out;
1336
1337 rc = skd_sg_io_await(skdev, &sksgio);
1338 if (rc)
1339 goto out;
1340
1341 rc = skd_sg_io_copy_buffer(skdev, &sksgio, SG_DXFER_FROM_DEV);
1342 if (rc)
1343 goto out;
1344
1345 rc = skd_sg_io_put_status(skdev, &sksgio);
1346 if (rc)
1347 goto out;
1348
1349 rc = 0;
1350
1351out:
1352 skd_sg_io_release_skspcl(skdev, &sksgio);
1353
1354 if (sksgio.iov != NULL && sksgio.iov != &sksgio.no_iov_iov)
1355 kfree(sksgio.iov);
1356 return rc;
1357}
1358
1359static int skd_sg_io_get_and_check_args(struct skd_device *skdev,
1360 struct skd_sg_io *sksgio)
1361{
1362 struct sg_io_hdr *sgp = &sksgio->sg;
1363 int i, acc;
1364
1365 if (!access_ok(VERIFY_WRITE, sksgio->argp, sizeof(sg_io_hdr_t))) {
1366 pr_debug("%s:%s:%d access sg failed %p\n",
1367 skdev->name, __func__, __LINE__, sksgio->argp);
1368 return -EFAULT;
1369 }
1370
1371 if (__copy_from_user(sgp, sksgio->argp, sizeof(sg_io_hdr_t))) {
1372 pr_debug("%s:%s:%d copy_from_user sg failed %p\n",
1373 skdev->name, __func__, __LINE__, sksgio->argp);
1374 return -EFAULT;
1375 }
1376
1377 if (sgp->interface_id != SG_INTERFACE_ID_ORIG) {
1378 pr_debug("%s:%s:%d interface_id invalid 0x%x\n",
1379 skdev->name, __func__, __LINE__, sgp->interface_id);
1380 return -EINVAL;
1381 }
1382
1383 if (sgp->cmd_len > sizeof(sksgio->cdb)) {
1384 pr_debug("%s:%s:%d cmd_len invalid %d\n",
1385 skdev->name, __func__, __LINE__, sgp->cmd_len);
1386 return -EINVAL;
1387 }
1388
1389 if (sgp->iovec_count > 256) {
1390 pr_debug("%s:%s:%d iovec_count invalid %d\n",
1391 skdev->name, __func__, __LINE__, sgp->iovec_count);
1392 return -EINVAL;
1393 }
1394
1395 if (sgp->dxfer_len > (PAGE_SIZE * SKD_N_SG_PER_SPECIAL)) {
1396 pr_debug("%s:%s:%d dxfer_len invalid %d\n",
1397 skdev->name, __func__, __LINE__, sgp->dxfer_len);
1398 return -EINVAL;
1399 }
1400
1401 switch (sgp->dxfer_direction) {
1402 case SG_DXFER_NONE:
1403 acc = -1;
1404 break;
1405
1406 case SG_DXFER_TO_DEV:
1407 acc = VERIFY_READ;
1408 break;
1409
1410 case SG_DXFER_FROM_DEV:
1411 case SG_DXFER_TO_FROM_DEV:
1412 acc = VERIFY_WRITE;
1413 break;
1414
1415 default:
1416 pr_debug("%s:%s:%d dxfer_dir invalid %d\n",
1417 skdev->name, __func__, __LINE__, sgp->dxfer_direction);
1418 return -EINVAL;
1419 }
1420
1421 if (copy_from_user(sksgio->cdb, sgp->cmdp, sgp->cmd_len)) {
1422 pr_debug("%s:%s:%d copy_from_user cmdp failed %p\n",
1423 skdev->name, __func__, __LINE__, sgp->cmdp);
1424 return -EFAULT;
1425 }
1426
1427 if (sgp->mx_sb_len != 0) {
1428 if (!access_ok(VERIFY_WRITE, sgp->sbp, sgp->mx_sb_len)) {
1429 pr_debug("%s:%s:%d access sbp failed %p\n",
1430 skdev->name, __func__, __LINE__, sgp->sbp);
1431 return -EFAULT;
1432 }
1433 }
1434
1435 if (sgp->iovec_count == 0) {
1436 sksgio->iov[0].iov_base = sgp->dxferp;
1437 sksgio->iov[0].iov_len = sgp->dxfer_len;
1438 sksgio->iovcnt = 1;
1439 sksgio->dxfer_len = sgp->dxfer_len;
1440 } else {
1441 struct sg_iovec *iov;
1442 uint nbytes = sizeof(*iov) * sgp->iovec_count;
1443 size_t iov_data_len;
1444
1445 iov = kmalloc(nbytes, GFP_KERNEL);
1446 if (iov == NULL) {
1447 pr_debug("%s:%s:%d alloc iovec failed %d\n",
1448 skdev->name, __func__, __LINE__,
1449 sgp->iovec_count);
1450 return -ENOMEM;
1451 }
1452 sksgio->iov = iov;
1453 sksgio->iovcnt = sgp->iovec_count;
1454
1455 if (copy_from_user(iov, sgp->dxferp, nbytes)) {
1456 pr_debug("%s:%s:%d copy_from_user iovec failed %p\n",
1457 skdev->name, __func__, __LINE__, sgp->dxferp);
1458 return -EFAULT;
1459 }
1460
1461 /*
1462 * Sum up the vecs, making sure they don't overflow
1463 */
1464 iov_data_len = 0;
1465 for (i = 0; i < sgp->iovec_count; i++) {
1466 if (iov_data_len + iov[i].iov_len < iov_data_len)
1467 return -EINVAL;
1468 iov_data_len += iov[i].iov_len;
1469 }
1470
1471 /* SG_IO howto says that the shorter of the two wins */
1472 if (sgp->dxfer_len < iov_data_len) {
1473 sksgio->iovcnt = iov_shorten((struct iovec *)iov,
1474 sgp->iovec_count,
1475 sgp->dxfer_len);
1476 sksgio->dxfer_len = sgp->dxfer_len;
1477 } else
1478 sksgio->dxfer_len = iov_data_len;
1479 }
1480
1481 if (sgp->dxfer_direction != SG_DXFER_NONE) {
1482 struct sg_iovec *iov = sksgio->iov;
1483 for (i = 0; i < sksgio->iovcnt; i++, iov++) {
1484 if (!access_ok(acc, iov->iov_base, iov->iov_len)) {
1485 pr_debug("%s:%s:%d access data failed %p/%d\n",
1486 skdev->name, __func__, __LINE__,
1487 iov->iov_base, (int)iov->iov_len);
1488 return -EFAULT;
1489 }
1490 }
1491 }
1492
1493 return 0;
1494}
1495
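The iovec summation in skd_sg_io_get_and_check_args() above guards against unsigned wrap-around with the idiom "if (iov_data_len + iov[i].iov_len < iov_data_len)": an unsigned sum can only be smaller than one of its operands if it wrapped. A minimal standalone illustration, with names invented for the sketch:

/* Illustrative only: the overflow check used when summing the SG_IO iovecs.
 * For unsigned arithmetic, a + b < a holds exactly when a + b wrapped.
 */
#include <assert.h>
#include <stddef.h>

static int sum_would_overflow(size_t a, size_t b)
{
	return a + b < a;	/* wrap-around detection; defined behavior for unsigned */
}

int main(void)
{
	assert(!sum_would_overflow(100, 200));
	assert(sum_would_overflow((size_t)-1, 1));	/* SIZE_MAX + 1 wraps to 0 */
	return 0;
}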
1496static int skd_sg_io_obtain_skspcl(struct skd_device *skdev,
1497 struct skd_sg_io *sksgio)
1498{
1499 struct skd_special_context *skspcl = NULL;
1500 int rc;
1501
1502 for (;;) {
1503 ulong flags;
1504
1505 spin_lock_irqsave(&skdev->lock, flags);
1506 skspcl = skdev->skspcl_free_list;
1507 if (skspcl != NULL) {
1508 skdev->skspcl_free_list =
1509 (struct skd_special_context *)skspcl->req.next;
1510 skspcl->req.id += SKD_ID_INCR;
1511 skspcl->req.state = SKD_REQ_STATE_SETUP;
1512 skspcl->orphaned = 0;
1513 skspcl->req.n_sg = 0;
1514 }
1515 spin_unlock_irqrestore(&skdev->lock, flags);
1516
1517 if (skspcl != NULL) {
1518 rc = 0;
1519 break;
1520 }
1521
1522 pr_debug("%s:%s:%d blocking\n",
1523 skdev->name, __func__, __LINE__);
1524
1525 rc = wait_event_interruptible_timeout(
1526 skdev->waitq,
1527 (skdev->skspcl_free_list != NULL),
1528 msecs_to_jiffies(sksgio->sg.timeout));
1529
1530 pr_debug("%s:%s:%d unblocking, rc=%d\n",
1531 skdev->name, __func__, __LINE__, rc);
1532
1533 if (rc <= 0) {
1534 if (rc == 0)
1535 rc = -ETIMEDOUT;
1536 else
1537 rc = -EINTR;
1538 break;
1539 }
1540 /*
1541 * If we get here rc > 0 meaning the timeout to
1542 * wait_event_interruptible_timeout() had time left, hence the
1543 * sought event -- non-empty free list -- happened.
1544 * Retry the allocation.
1545 */
1546 }
1547 sksgio->skspcl = skspcl;
1548
1549 return rc;
1550}
1551
1552static int skd_skreq_prep_buffering(struct skd_device *skdev,
1553 struct skd_request_context *skreq,
1554 u32 dxfer_len)
1555{
1556 u32 resid = dxfer_len;
1557
1558 /*
1559 * The DMA engine must have aligned addresses and byte counts.
1560 */
1561 resid += (-resid) & 3;
1562 skreq->sg_byte_count = resid;
1563
1564 skreq->n_sg = 0;
1565
1566 while (resid > 0) {
1567 u32 nbytes = PAGE_SIZE;
1568 u32 ix = skreq->n_sg;
1569 struct scatterlist *sg = &skreq->sg[ix];
1570 struct fit_sg_descriptor *sksg = &skreq->sksg_list[ix];
1571 struct page *page;
1572
1573 if (nbytes > resid)
1574 nbytes = resid;
1575
1576 page = alloc_page(GFP_KERNEL);
1577 if (page == NULL)
1578 return -ENOMEM;
1579
1580 sg_set_page(sg, page, nbytes, 0);
1581
1582 /* TODO: This should be going through a pci_???()
1583 * routine to do proper mapping. */
1584 sksg->control = FIT_SGD_CONTROL_NOT_LAST;
1585 sksg->byte_count = nbytes;
1586
1587 sksg->host_side_addr = sg_phys(sg);
1588
1589 sksg->dev_side_addr = 0;
1590 sksg->next_desc_ptr = skreq->sksg_dma_address +
1591 (ix + 1) * sizeof(*sksg);
1592
1593 skreq->n_sg++;
1594 resid -= nbytes;
1595 }
1596
1597 if (skreq->n_sg > 0) {
1598 u32 ix = skreq->n_sg - 1;
1599 struct fit_sg_descriptor *sksg = &skreq->sksg_list[ix];
1600
1601 sksg->control = FIT_SGD_CONTROL_LAST;
1602 sksg->next_desc_ptr = 0;
1603 }
1604
1605 if (unlikely(skdev->dbg_level > 1)) {
1606 u32 i;
1607
1608 pr_debug("%s:%s:%d skreq=%x sksg_list=%p sksg_dma=%llx\n",
1609 skdev->name, __func__, __LINE__,
1610 skreq->id, skreq->sksg_list, skreq->sksg_dma_address);
1611 for (i = 0; i < skreq->n_sg; i++) {
1612 struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
1613
1614 pr_debug("%s:%s:%d sg[%d] count=%u ctrl=0x%x "
1615 "addr=0x%llx next=0x%llx\n",
1616 skdev->name, __func__, __LINE__,
1617 i, sgd->byte_count, sgd->control,
1618 sgd->host_side_addr, sgd->next_desc_ptr);
1619 }
1620 }
1621
1622 return 0;
1623}
1624
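The "resid += (-resid) & 3;" line in skd_skreq_prep_buffering() above rounds the transfer length up to the DMA engine's 4-byte alignment: negating an unsigned value and masking its low two bits yields exactly the number of pad bytes needed. A standalone sketch of the idiom, with invented names:

/* Illustrative only: the round-up-to-4 idiom from skd_skreq_prep_buffering().
 * For unsigned n, (-n) & 3 equals (4 - n % 4) % 4.
 */
#include <assert.h>

static unsigned int round_up_to_4(unsigned int n)
{
	return n + ((0u - n) & 3u);
}

int main(void)
{
	assert(round_up_to_4(0)  == 0);
	assert(round_up_to_4(5)  == 8);
	assert(round_up_to_4(8)  == 8);
	assert(round_up_to_4(10) == 12);
	return 0;
}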
1625static int skd_sg_io_prep_buffering(struct skd_device *skdev,
1626 struct skd_sg_io *sksgio)
1627{
1628 struct skd_special_context *skspcl = sksgio->skspcl;
1629 struct skd_request_context *skreq = &skspcl->req;
1630 u32 dxfer_len = sksgio->dxfer_len;
1631 int rc;
1632
1633 rc = skd_skreq_prep_buffering(skdev, skreq, dxfer_len);
1634 /*
1635 * Eventually, errors or not, skd_release_special() is called
1636 * to recover allocations including partial allocations.
1637 */
1638 return rc;
1639}
1640
1641static int skd_sg_io_copy_buffer(struct skd_device *skdev,
1642 struct skd_sg_io *sksgio, int dxfer_dir)
1643{
1644 struct skd_special_context *skspcl = sksgio->skspcl;
1645 u32 iov_ix = 0;
1646 struct sg_iovec curiov;
1647 u32 sksg_ix = 0;
1648 u8 *bufp = NULL;
1649 u32 buf_len = 0;
1650 u32 resid = sksgio->dxfer_len;
1651 int rc;
1652
1653 curiov.iov_len = 0;
1654 curiov.iov_base = NULL;
1655
1656 if (dxfer_dir != sksgio->sg.dxfer_direction) {
1657 if (dxfer_dir != SG_DXFER_TO_DEV ||
1658 sksgio->sg.dxfer_direction != SG_DXFER_TO_FROM_DEV)
1659 return 0;
1660 }
1661
1662 while (resid > 0) {
1663 u32 nbytes = PAGE_SIZE;
1664
1665 if (curiov.iov_len == 0) {
1666 curiov = sksgio->iov[iov_ix++];
1667 continue;
1668 }
1669
1670 if (buf_len == 0) {
1671 struct page *page;
1672 page = sg_page(&skspcl->req.sg[sksg_ix++]);
1673 bufp = page_address(page);
1674 buf_len = PAGE_SIZE;
1675 }
1676
1677 nbytes = min_t(u32, nbytes, resid);
1678 nbytes = min_t(u32, nbytes, curiov.iov_len);
1679 nbytes = min_t(u32, nbytes, buf_len);
1680
1681 if (dxfer_dir == SG_DXFER_TO_DEV)
1682 rc = __copy_from_user(bufp, curiov.iov_base, nbytes);
1683 else
1684 rc = __copy_to_user(curiov.iov_base, bufp, nbytes);
1685
1686 if (rc)
1687 return -EFAULT;
1688
1689 resid -= nbytes;
1690 curiov.iov_len -= nbytes;
1691 curiov.iov_base += nbytes;
1692 buf_len -= nbytes;
1693 }
1694
1695 return 0;
1696}
1697
1698static int skd_sg_io_send_fitmsg(struct skd_device *skdev,
1699 struct skd_sg_io *sksgio)
1700{
1701 struct skd_special_context *skspcl = sksgio->skspcl;
1702 struct fit_msg_hdr *fmh = (struct fit_msg_hdr *)skspcl->msg_buf;
1703 struct skd_scsi_request *scsi_req = (struct skd_scsi_request *)&fmh[1];
1704
1705 memset(skspcl->msg_buf, 0, SKD_N_SPECIAL_FITMSG_BYTES);
1706
1707 /* Initialize the FIT msg header */
1708 fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT;
1709 fmh->num_protocol_cmds_coalesced = 1;
1710
1711 /* Initialize the SCSI request */
1712 if (sksgio->sg.dxfer_direction != SG_DXFER_NONE)
1713 scsi_req->hdr.sg_list_dma_address =
1714 cpu_to_be64(skspcl->req.sksg_dma_address);
1715 scsi_req->hdr.tag = skspcl->req.id;
1716 scsi_req->hdr.sg_list_len_bytes =
1717 cpu_to_be32(skspcl->req.sg_byte_count);
1718 memcpy(scsi_req->cdb, sksgio->cdb, sizeof(scsi_req->cdb));
1719
1720 skspcl->req.state = SKD_REQ_STATE_BUSY;
1721 skd_send_special_fitmsg(skdev, skspcl);
1722
1723 return 0;
1724}
1725
1726static int skd_sg_io_await(struct skd_device *skdev, struct skd_sg_io *sksgio)
1727{
1728 unsigned long flags;
1729 int rc;
1730
1731 rc = wait_event_interruptible_timeout(skdev->waitq,
1732 (sksgio->skspcl->req.state !=
1733 SKD_REQ_STATE_BUSY),
1734 msecs_to_jiffies(sksgio->sg.
1735 timeout));
1736
1737 spin_lock_irqsave(&skdev->lock, flags);
1738
1739 if (sksgio->skspcl->req.state == SKD_REQ_STATE_ABORTED) {
1740 pr_debug("%s:%s:%d skspcl %p aborted\n",
1741 skdev->name, __func__, __LINE__, sksgio->skspcl);
1742
1743 		/* Build a check condition with sense data and let the
1744 		 * command finish. For a timeout we must fabricate the
1745 		 * completion and sense data ourselves. */
1746 sksgio->skspcl->req.completion.status =
1747 SAM_STAT_CHECK_CONDITION;
1748
1749 memset(&sksgio->skspcl->req.err_info, 0,
1750 sizeof(sksgio->skspcl->req.err_info));
1751 sksgio->skspcl->req.err_info.type = 0x70;
1752 sksgio->skspcl->req.err_info.key = ABORTED_COMMAND;
1753 sksgio->skspcl->req.err_info.code = 0x44;
1754 sksgio->skspcl->req.err_info.qual = 0;
1755 rc = 0;
1756 } else if (sksgio->skspcl->req.state != SKD_REQ_STATE_BUSY)
1757 /* No longer on the adapter. We finish. */
1758 rc = 0;
1759 else {
1760 		/* Something's gone wrong. Still busy. Timeout or
1761 		 * user interrupt (control-C). Mark it as an orphan
1762 		 * so it will be disposed of when it completes. */
1763 sksgio->skspcl->orphaned = 1;
1764 sksgio->skspcl = NULL;
1765 if (rc == 0) {
1766 pr_debug("%s:%s:%d timed out %p (%u ms)\n",
1767 skdev->name, __func__, __LINE__,
1768 sksgio, sksgio->sg.timeout);
1769 rc = -ETIMEDOUT;
1770 } else {
1771 pr_debug("%s:%s:%d cntlc %p\n",
1772 skdev->name, __func__, __LINE__, sksgio);
1773 rc = -EINTR;
1774 }
1775 }
1776
1777 spin_unlock_irqrestore(&skdev->lock, flags);
1778
1779 return rc;
1780}
1781
1782static int skd_sg_io_put_status(struct skd_device *skdev,
1783 struct skd_sg_io *sksgio)
1784{
1785 struct sg_io_hdr *sgp = &sksgio->sg;
1786 struct skd_special_context *skspcl = sksgio->skspcl;
1787 int resid = 0;
1788
1789 u32 nb = be32_to_cpu(skspcl->req.completion.num_returned_bytes);
1790
1791 sgp->status = skspcl->req.completion.status;
1792 resid = sksgio->dxfer_len - nb;
1793
1794 sgp->masked_status = sgp->status & STATUS_MASK;
1795 sgp->msg_status = 0;
1796 sgp->host_status = 0;
1797 sgp->driver_status = 0;
1798 sgp->resid = resid;
1799 if (sgp->masked_status || sgp->host_status || sgp->driver_status)
1800 sgp->info |= SG_INFO_CHECK;
1801
1802 pr_debug("%s:%s:%d status %x masked %x resid 0x%x\n",
1803 skdev->name, __func__, __LINE__,
1804 sgp->status, sgp->masked_status, sgp->resid);
1805
1806 if (sgp->masked_status == SAM_STAT_CHECK_CONDITION) {
1807 if (sgp->mx_sb_len > 0) {
1808 struct fit_comp_error_info *ei = &skspcl->req.err_info;
1809 u32 nbytes = sizeof(*ei);
1810
1811 nbytes = min_t(u32, nbytes, sgp->mx_sb_len);
1812
1813 sgp->sb_len_wr = nbytes;
1814
1815 if (__copy_to_user(sgp->sbp, ei, nbytes)) {
1816 pr_debug("%s:%s:%d copy_to_user sense failed %p\n",
1817 skdev->name, __func__, __LINE__,
1818 sgp->sbp);
1819 return -EFAULT;
1820 }
1821 }
1822 }
1823
1824 if (__copy_to_user(sksgio->argp, sgp, sizeof(sg_io_hdr_t))) {
1825 pr_debug("%s:%s:%d copy_to_user sg failed %p\n",
1826 skdev->name, __func__, __LINE__, sksgio->argp);
1827 return -EFAULT;
1828 }
1829
1830 return 0;
1831}
1832
1833static int skd_sg_io_release_skspcl(struct skd_device *skdev,
1834 struct skd_sg_io *sksgio)
1835{
1836 struct skd_special_context *skspcl = sksgio->skspcl;
1837
1838 if (skspcl != NULL) {
1839 ulong flags;
1840
1841 sksgio->skspcl = NULL;
1842
1843 spin_lock_irqsave(&skdev->lock, flags);
1844 skd_release_special(skdev, skspcl);
1845 spin_unlock_irqrestore(&skdev->lock, flags);
1846 }
1847
1848 return 0;
1849}
1850
1851/*
1852 *****************************************************************************
1853 * INTERNAL REQUESTS -- generated by driver itself
1854 *****************************************************************************
1855 */
1856
1857static int skd_format_internal_skspcl(struct skd_device *skdev)
1858{
1859 struct skd_special_context *skspcl = &skdev->internal_skspcl;
1860 struct fit_sg_descriptor *sgd = &skspcl->req.sksg_list[0];
1861 struct fit_msg_hdr *fmh;
1862 uint64_t dma_address;
1863 struct skd_scsi_request *scsi;
1864
1865 fmh = (struct fit_msg_hdr *)&skspcl->msg_buf[0];
1866 fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT;
1867 fmh->num_protocol_cmds_coalesced = 1;
1868
1869 scsi = (struct skd_scsi_request *)&skspcl->msg_buf[64];
1870 memset(scsi, 0, sizeof(*scsi));
1871 dma_address = skspcl->req.sksg_dma_address;
1872 scsi->hdr.sg_list_dma_address = cpu_to_be64(dma_address);
1873 sgd->control = FIT_SGD_CONTROL_LAST;
1874 sgd->byte_count = 0;
1875 sgd->host_side_addr = skspcl->db_dma_address;
1876 sgd->dev_side_addr = 0;
1877 sgd->next_desc_ptr = 0LL;
1878
1879 return 1;
1880}
1881
1882#define WR_BUF_SIZE SKD_N_INTERNAL_BYTES
1883
1884static void skd_send_internal_skspcl(struct skd_device *skdev,
1885 struct skd_special_context *skspcl,
1886 u8 opcode)
1887{
1888 struct fit_sg_descriptor *sgd = &skspcl->req.sksg_list[0];
1889 struct skd_scsi_request *scsi;
1890 unsigned char *buf = skspcl->data_buf;
1891 int i;
1892
1893 if (skspcl->req.state != SKD_REQ_STATE_IDLE)
1894 /*
1895 * A refresh is already in progress.
1896 * Just wait for it to finish.
1897 */
1898 return;
1899
1900 SKD_ASSERT((skspcl->req.id & SKD_ID_INCR) == 0);
1901 skspcl->req.state = SKD_REQ_STATE_BUSY;
1902 skspcl->req.id += SKD_ID_INCR;
1903
1904 scsi = (struct skd_scsi_request *)&skspcl->msg_buf[64];
1905 scsi->hdr.tag = skspcl->req.id;
1906
1907 memset(scsi->cdb, 0, sizeof(scsi->cdb));
1908
1909 switch (opcode) {
1910 case TEST_UNIT_READY:
1911 scsi->cdb[0] = TEST_UNIT_READY;
1912 sgd->byte_count = 0;
1913 scsi->hdr.sg_list_len_bytes = 0;
1914 break;
1915
1916 case READ_CAPACITY:
1917 scsi->cdb[0] = READ_CAPACITY;
1918 sgd->byte_count = SKD_N_READ_CAP_BYTES;
1919 scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count);
1920 break;
1921
1922 case INQUIRY:
1923 scsi->cdb[0] = INQUIRY;
1924 scsi->cdb[1] = 0x01; /* evpd */
1925 scsi->cdb[2] = 0x80; /* serial number page */
1926 scsi->cdb[4] = 0x10;
1927 sgd->byte_count = 16;
1928 scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count);
1929 break;
1930
1931 case SYNCHRONIZE_CACHE:
1932 scsi->cdb[0] = SYNCHRONIZE_CACHE;
1933 sgd->byte_count = 0;
1934 scsi->hdr.sg_list_len_bytes = 0;
1935 break;
1936
1937 case WRITE_BUFFER:
1938 scsi->cdb[0] = WRITE_BUFFER;
1939 scsi->cdb[1] = 0x02;
1940 scsi->cdb[7] = (WR_BUF_SIZE & 0xFF00) >> 8;
1941 scsi->cdb[8] = WR_BUF_SIZE & 0xFF;
1942 sgd->byte_count = WR_BUF_SIZE;
1943 scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count);
1944 /* fill incrementing byte pattern */
1945 for (i = 0; i < sgd->byte_count; i++)
1946 buf[i] = i & 0xFF;
1947 break;
1948
1949 case READ_BUFFER:
1950 scsi->cdb[0] = READ_BUFFER;
1951 scsi->cdb[1] = 0x02;
1952 scsi->cdb[7] = (WR_BUF_SIZE & 0xFF00) >> 8;
1953 scsi->cdb[8] = WR_BUF_SIZE & 0xFF;
1954 sgd->byte_count = WR_BUF_SIZE;
1955 scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count);
1956 memset(skspcl->data_buf, 0, sgd->byte_count);
1957 break;
1958
1959 default:
1960 SKD_ASSERT("Don't know what to send");
1961 return;
1962
1963 }
1964 skd_send_special_fitmsg(skdev, skspcl);
1965}
1966
1967static void skd_refresh_device_data(struct skd_device *skdev)
1968{
1969 struct skd_special_context *skspcl = &skdev->internal_skspcl;
1970
1971 skd_send_internal_skspcl(skdev, skspcl, TEST_UNIT_READY);
1972}
1973
1974static int skd_chk_read_buf(struct skd_device *skdev,
1975 struct skd_special_context *skspcl)
1976{
1977 unsigned char *buf = skspcl->data_buf;
1978 int i;
1979
1980 /* check for incrementing byte pattern */
1981 for (i = 0; i < WR_BUF_SIZE; i++)
1982 if (buf[i] != (i & 0xFF))
1983 return 1;
1984
1985 return 0;
1986}
1987
1988static void skd_log_check_status(struct skd_device *skdev, u8 status, u8 key,
1989 u8 code, u8 qual, u8 fruc)
1990{
1991 /* If the check condition is of special interest, log a message */
1992 if ((status == SAM_STAT_CHECK_CONDITION) && (key == 0x02)
1993 && (code == 0x04) && (qual == 0x06)) {
1994 pr_err("(%s): *** LOST_WRITE_DATA ERROR *** key/asc/"
1995 "ascq/fruc %02x/%02x/%02x/%02x\n",
1996 skd_name(skdev), key, code, qual, fruc);
1997 }
1998}
1999
2000static void skd_complete_internal(struct skd_device *skdev,
2001 volatile struct fit_completion_entry_v1
2002 *skcomp,
2003 volatile struct fit_comp_error_info *skerr,
2004 struct skd_special_context *skspcl)
2005{
2006 u8 *buf = skspcl->data_buf;
2007 u8 status;
2008 int i;
2009 struct skd_scsi_request *scsi =
2010 (struct skd_scsi_request *)&skspcl->msg_buf[64];
2011
2012 SKD_ASSERT(skspcl == &skdev->internal_skspcl);
2013
2014 pr_debug("%s:%s:%d complete internal %x\n",
2015 skdev->name, __func__, __LINE__, scsi->cdb[0]);
2016
2017 skspcl->req.completion = *skcomp;
2018 skspcl->req.state = SKD_REQ_STATE_IDLE;
2019 skspcl->req.id += SKD_ID_INCR;
2020
2021 status = skspcl->req.completion.status;
2022
2023 skd_log_check_status(skdev, status, skerr->key, skerr->code,
2024 skerr->qual, skerr->fruc);
2025
2026 switch (scsi->cdb[0]) {
2027 case TEST_UNIT_READY:
2028 if (status == SAM_STAT_GOOD)
2029 skd_send_internal_skspcl(skdev, skspcl, WRITE_BUFFER);
2030 else if ((status == SAM_STAT_CHECK_CONDITION) &&
2031 (skerr->key == MEDIUM_ERROR))
2032 skd_send_internal_skspcl(skdev, skspcl, WRITE_BUFFER);
2033 else {
2034 if (skdev->state == SKD_DRVR_STATE_STOPPING) {
2035 pr_debug("%s:%s:%d TUR failed, don't send anymore state 0x%x\n",
2036 skdev->name, __func__, __LINE__,
2037 skdev->state);
2038 return;
2039 }
2040 pr_debug("%s:%s:%d **** TUR failed, retry skerr\n",
2041 skdev->name, __func__, __LINE__);
2042 skd_send_internal_skspcl(skdev, skspcl, 0x00);
2043 }
2044 break;
2045
2046 case WRITE_BUFFER:
2047 if (status == SAM_STAT_GOOD)
2048 skd_send_internal_skspcl(skdev, skspcl, READ_BUFFER);
2049 else {
2050 if (skdev->state == SKD_DRVR_STATE_STOPPING) {
2051 pr_debug("%s:%s:%d write buffer failed, don't send anymore state 0x%x\n",
2052 skdev->name, __func__, __LINE__,
2053 skdev->state);
2054 return;
2055 }
2056 pr_debug("%s:%s:%d **** write buffer failed, retry skerr\n",
2057 skdev->name, __func__, __LINE__);
2058 skd_send_internal_skspcl(skdev, skspcl, 0x00);
2059 }
2060 break;
2061
2062 case READ_BUFFER:
2063 if (status == SAM_STAT_GOOD) {
2064 if (skd_chk_read_buf(skdev, skspcl) == 0)
2065 skd_send_internal_skspcl(skdev, skspcl,
2066 READ_CAPACITY);
2067 else {
2068 pr_err(
2069 "(%s):*** W/R Buffer mismatch %d ***\n",
2070 skd_name(skdev), skdev->connect_retries);
2071 if (skdev->connect_retries <
2072 SKD_MAX_CONNECT_RETRIES) {
2073 skdev->connect_retries++;
2074 skd_soft_reset(skdev);
2075 } else {
2076 pr_err(
2077 "(%s): W/R Buffer Connect Error\n",
2078 skd_name(skdev));
2079 return;
2080 }
2081 }
2082
2083 } else {
2084 if (skdev->state == SKD_DRVR_STATE_STOPPING) {
2085 pr_debug("%s:%s:%d "
2086 "read buffer failed, don't send anymore state 0x%x\n",
2087 skdev->name, __func__, __LINE__,
2088 skdev->state);
2089 return;
2090 }
2091 pr_debug("%s:%s:%d "
2092 "**** read buffer failed, retry skerr\n",
2093 skdev->name, __func__, __LINE__);
2094 skd_send_internal_skspcl(skdev, skspcl, 0x00);
2095 }
2096 break;
2097
2098 case READ_CAPACITY:
2099 skdev->read_cap_is_valid = 0;
2100 if (status == SAM_STAT_GOOD) {
2101 skdev->read_cap_last_lba =
2102 (buf[0] << 24) | (buf[1] << 16) |
2103 (buf[2] << 8) | buf[3];
2104 skdev->read_cap_blocksize =
2105 (buf[4] << 24) | (buf[5] << 16) |
2106 (buf[6] << 8) | buf[7];
2107
2108 pr_debug("%s:%s:%d last lba %d, bs %d\n",
2109 skdev->name, __func__, __LINE__,
2110 skdev->read_cap_last_lba,
2111 skdev->read_cap_blocksize);
2112
2113 set_capacity(skdev->disk, skdev->read_cap_last_lba + 1);
2114
2115 skdev->read_cap_is_valid = 1;
2116
2117 skd_send_internal_skspcl(skdev, skspcl, INQUIRY);
2118 } else if ((status == SAM_STAT_CHECK_CONDITION) &&
2119 (skerr->key == MEDIUM_ERROR)) {
2120 skdev->read_cap_last_lba = ~0;
2121 set_capacity(skdev->disk, skdev->read_cap_last_lba + 1);
2122 pr_debug("%s:%s:%d "
2123 "**** MEDIUM ERROR caused READCAP to fail, ignore failure and continue to inquiry\n",
2124 skdev->name, __func__, __LINE__);
2125 skd_send_internal_skspcl(skdev, skspcl, INQUIRY);
2126 } else {
2127 pr_debug("%s:%s:%d **** READCAP failed, retry TUR\n",
2128 skdev->name, __func__, __LINE__);
2129 skd_send_internal_skspcl(skdev, skspcl,
2130 TEST_UNIT_READY);
2131 }
2132 break;
2133
2134 case INQUIRY:
2135 skdev->inquiry_is_valid = 0;
2136 if (status == SAM_STAT_GOOD) {
2137 skdev->inquiry_is_valid = 1;
2138
2139 for (i = 0; i < 12; i++)
2140 skdev->inq_serial_num[i] = buf[i + 4];
2141 skdev->inq_serial_num[12] = 0;
2142 }
2143
2144 if (skd_unquiesce_dev(skdev) < 0)
2145 pr_debug("%s:%s:%d **** failed, to ONLINE device\n",
2146 skdev->name, __func__, __LINE__);
2147 /* connection is complete */
2148 skdev->connect_retries = 0;
2149 break;
2150
2151 case SYNCHRONIZE_CACHE:
2152 if (status == SAM_STAT_GOOD)
2153 skdev->sync_done = 1;
2154 else
2155 skdev->sync_done = -1;
2156 wake_up_interruptible(&skdev->waitq);
2157 break;
2158
2159 default:
2160 SKD_ASSERT("we didn't send this");
2161 }
2162}
2163
2164/*
2165 *****************************************************************************
2166 * FIT MESSAGES
2167 *****************************************************************************
2168 */
2169
2170static void skd_send_fitmsg(struct skd_device *skdev,
2171 struct skd_fitmsg_context *skmsg)
2172{
2173 u64 qcmd;
2174 struct fit_msg_hdr *fmh;
2175
2176 pr_debug("%s:%s:%d dma address 0x%llx, busy=%d\n",
2177 skdev->name, __func__, __LINE__,
2178 skmsg->mb_dma_address, skdev->in_flight);
2179 pr_debug("%s:%s:%d msg_buf 0x%p, offset %x\n",
2180 skdev->name, __func__, __LINE__,
2181 skmsg->msg_buf, skmsg->offset);
2182
2183 qcmd = skmsg->mb_dma_address;
2184 qcmd |= FIT_QCMD_QID_NORMAL;
2185
2186 fmh = (struct fit_msg_hdr *)skmsg->msg_buf;
2187 skmsg->outstanding = fmh->num_protocol_cmds_coalesced;
2188
2189 if (unlikely(skdev->dbg_level > 1)) {
2190 u8 *bp = (u8 *)skmsg->msg_buf;
2191 int i;
2192 for (i = 0; i < skmsg->length; i += 8) {
2193 pr_debug("%s:%s:%d msg[%2d] %02x %02x %02x %02x "
2194 "%02x %02x %02x %02x\n",
2195 skdev->name, __func__, __LINE__,
2196 i, bp[i + 0], bp[i + 1], bp[i + 2],
2197 bp[i + 3], bp[i + 4], bp[i + 5],
2198 bp[i + 6], bp[i + 7]);
2199 if (i == 0)
2200 i = 64 - 8;
2201 }
2202 }
2203
2204 if (skmsg->length > 256)
2205 qcmd |= FIT_QCMD_MSGSIZE_512;
2206 else if (skmsg->length > 128)
2207 qcmd |= FIT_QCMD_MSGSIZE_256;
2208 else if (skmsg->length > 64)
2209 qcmd |= FIT_QCMD_MSGSIZE_128;
2210 else
2211 /*
2212 * This makes no sense because the FIT msg header is
2213 * 64 bytes. If the msg is only 64 bytes long it has
2214 * no payload.
2215 */
2216 qcmd |= FIT_QCMD_MSGSIZE_64;
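	/* (Worked example of the thresholds above: a 192-byte FIT msg is
	 *  > 128 but <= 256 bytes, so it is tagged FIT_QCMD_MSGSIZE_256.) */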
2217
2218 SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND);
2219
2220}
2221
2222static void skd_send_special_fitmsg(struct skd_device *skdev,
2223 struct skd_special_context *skspcl)
2224{
2225 u64 qcmd;
2226
2227 if (unlikely(skdev->dbg_level > 1)) {
2228 u8 *bp = (u8 *)skspcl->msg_buf;
2229 int i;
2230
2231 for (i = 0; i < SKD_N_SPECIAL_FITMSG_BYTES; i += 8) {
2232 pr_debug("%s:%s:%d spcl[%2d] %02x %02x %02x %02x "
2233 "%02x %02x %02x %02x\n",
2234 skdev->name, __func__, __LINE__, i,
2235 bp[i + 0], bp[i + 1], bp[i + 2], bp[i + 3],
2236 bp[i + 4], bp[i + 5], bp[i + 6], bp[i + 7]);
2237 if (i == 0)
2238 i = 64 - 8;
2239 }
2240
2241 pr_debug("%s:%s:%d skspcl=%p id=%04x sksg_list=%p sksg_dma=%llx\n",
2242 skdev->name, __func__, __LINE__,
2243 skspcl, skspcl->req.id, skspcl->req.sksg_list,
2244 skspcl->req.sksg_dma_address);
2245 for (i = 0; i < skspcl->req.n_sg; i++) {
2246 struct fit_sg_descriptor *sgd =
2247 &skspcl->req.sksg_list[i];
2248
2249 pr_debug("%s:%s:%d sg[%d] count=%u ctrl=0x%x "
2250 "addr=0x%llx next=0x%llx\n",
2251 skdev->name, __func__, __LINE__,
2252 i, sgd->byte_count, sgd->control,
2253 sgd->host_side_addr, sgd->next_desc_ptr);
2254 }
2255 }
2256
2257 /*
2258 * Special FIT msgs are always 128 bytes: a 64-byte FIT hdr
2259 * and one 64-byte SSDI command.
2260 */
2261 qcmd = skspcl->mb_dma_address;
2262 qcmd |= FIT_QCMD_QID_NORMAL + FIT_QCMD_MSGSIZE_128;
2263
2264 SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND);
2265}
2266
2267/*
2268 *****************************************************************************
2269 * COMPLETION QUEUE
2270 *****************************************************************************
2271 */
2272
2273static void skd_complete_other(struct skd_device *skdev,
2274 volatile struct fit_completion_entry_v1 *skcomp,
2275 volatile struct fit_comp_error_info *skerr);
2276
2277struct sns_info {
2278 u8 type;
2279 u8 stat;
2280 u8 key;
2281 u8 asc;
2282 u8 ascq;
2283 u8 mask;
2284 enum skd_check_status_action action;
2285};
2286
2287static struct sns_info skd_chkstat_table[] = {
2288 /* Good */
2289 { 0x70, 0x02, RECOVERED_ERROR, 0, 0, 0x1c,
2290 SKD_CHECK_STATUS_REPORT_GOOD },
2291
2292 /* Smart alerts */
2293 { 0x70, 0x02, NO_SENSE, 0x0B, 0x00, 0x1E, /* warnings */
2294 SKD_CHECK_STATUS_REPORT_SMART_ALERT },
2295 { 0x70, 0x02, NO_SENSE, 0x5D, 0x00, 0x1E, /* thresholds */
2296 SKD_CHECK_STATUS_REPORT_SMART_ALERT },
2297 { 0x70, 0x02, RECOVERED_ERROR, 0x0B, 0x01, 0x1F, /* temperature over trigger */
2298 SKD_CHECK_STATUS_REPORT_SMART_ALERT },
2299
2300 /* Retry (with limits) */
2301 { 0x70, 0x02, 0x0B, 0, 0, 0x1C, /* This one is for DMA ERROR */
2302 SKD_CHECK_STATUS_REQUEUE_REQUEST },
2303 { 0x70, 0x02, 0x06, 0x0B, 0x00, 0x1E, /* warnings */
2304 SKD_CHECK_STATUS_REQUEUE_REQUEST },
2305 { 0x70, 0x02, 0x06, 0x5D, 0x00, 0x1E, /* thresholds */
2306 SKD_CHECK_STATUS_REQUEUE_REQUEST },
2307 { 0x70, 0x02, 0x06, 0x80, 0x30, 0x1F, /* backup power */
2308 SKD_CHECK_STATUS_REQUEUE_REQUEST },
2309
2310 /* Busy (or about to be) */
2311 { 0x70, 0x02, 0x06, 0x3f, 0x01, 0x1F, /* fw changed */
2312 SKD_CHECK_STATUS_BUSY_IMMINENT },
2313};
2314
2315/*
2316 * Look up status and sense data to decide how to handle the error
2317 * from the device.
2318 	 * mask says which fields must match, e.g. mask=0x18 means check
2319 	 * type and stat and ignore key, asc and ascq.
2320 */
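/*
 * (Illustrative mapping of the mask bits, matching the checks in the loop
 *  below: 0x10 = type, 0x08 = stat, 0x04 = key, 0x02 = asc, 0x01 = ascq.
 *  So the first table entry's mask of 0x1c requires type, stat and key to
 *  match while ignoring asc and ascq.)
 */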
2321
2322static enum skd_check_status_action
2323skd_check_status(struct skd_device *skdev,
2324 u8 cmp_status, volatile struct fit_comp_error_info *skerr)
2325{
2326 int i, n;
2327
2328 pr_err("(%s): key/asc/ascq/fruc %02x/%02x/%02x/%02x\n",
2329 skd_name(skdev), skerr->key, skerr->code, skerr->qual,
2330 skerr->fruc);
2331
2332 pr_debug("%s:%s:%d stat: t=%02x stat=%02x k=%02x c=%02x q=%02x fruc=%02x\n",
2333 skdev->name, __func__, __LINE__, skerr->type, cmp_status,
2334 skerr->key, skerr->code, skerr->qual, skerr->fruc);
2335
2336 /* Does the info match an entry in the good category? */
2337 n = sizeof(skd_chkstat_table) / sizeof(skd_chkstat_table[0]);
2338 for (i = 0; i < n; i++) {
2339 struct sns_info *sns = &skd_chkstat_table[i];
2340
2341 if (sns->mask & 0x10)
2342 if (skerr->type != sns->type)
2343 continue;
2344
2345 if (sns->mask & 0x08)
2346 if (cmp_status != sns->stat)
2347 continue;
2348
2349 if (sns->mask & 0x04)
2350 if (skerr->key != sns->key)
2351 continue;
2352
2353 if (sns->mask & 0x02)
2354 if (skerr->code != sns->asc)
2355 continue;
2356
2357 if (sns->mask & 0x01)
2358 if (skerr->qual != sns->ascq)
2359 continue;
2360
2361 if (sns->action == SKD_CHECK_STATUS_REPORT_SMART_ALERT) {
2362 pr_err("(%s): SMART Alert: sense key/asc/ascq "
2363 "%02x/%02x/%02x\n",
2364 skd_name(skdev), skerr->key,
2365 skerr->code, skerr->qual);
2366 }
2367 return sns->action;
2368 }
2369
2370 /* No other match, so nonzero status means error,
2371 * zero status means good
2372 */
2373 if (cmp_status) {
2374 pr_debug("%s:%s:%d status check: error\n",
2375 skdev->name, __func__, __LINE__);
2376 return SKD_CHECK_STATUS_REPORT_ERROR;
2377 }
2378
2379 pr_debug("%s:%s:%d status check good default\n",
2380 skdev->name, __func__, __LINE__);
2381 return SKD_CHECK_STATUS_REPORT_GOOD;
2382}
2383
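/*
 * Overview of skd_resolve_req_exception() below (added for clarity):
 * skd_check_status() maps the completion status and sense data onto an
 * action, which is resolved here by completing the request, requeueing it
 * (bounded by SKD_MAX_RETRIES), or quiescing the device when the drive
 * reports that it is about to go busy.
 */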
2384static void skd_resolve_req_exception(struct skd_device *skdev,
2385 struct skd_request_context *skreq)
2386{
2387 u8 cmp_status = skreq->completion.status;
2388
2389 switch (skd_check_status(skdev, cmp_status, &skreq->err_info)) {
2390 case SKD_CHECK_STATUS_REPORT_GOOD:
2391 case SKD_CHECK_STATUS_REPORT_SMART_ALERT:
2392 skd_end_request(skdev, skreq, 0);
2393 break;
2394
2395 case SKD_CHECK_STATUS_BUSY_IMMINENT:
2396 skd_log_skreq(skdev, skreq, "retry(busy)");
2397 blk_requeue_request(skdev->queue, skreq->req);
2398 pr_info("(%s) drive BUSY imminent\n", skd_name(skdev));
2399 skdev->state = SKD_DRVR_STATE_BUSY_IMMINENT;
2400 skdev->timer_countdown = SKD_TIMER_MINUTES(20);
2401 skd_quiesce_dev(skdev);
2402 break;
2403
2404 case SKD_CHECK_STATUS_REQUEUE_REQUEST:
2405 if ((unsigned long) ++skreq->req->special < SKD_MAX_RETRIES) {
2406 skd_log_skreq(skdev, skreq, "retry");
2407 blk_requeue_request(skdev->queue, skreq->req);
2408 break;
2409 }
2410 /* fall through to report error */
2411
2412 case SKD_CHECK_STATUS_REPORT_ERROR:
2413 default:
2414 skd_end_request(skdev, skreq, -EIO);
2415 break;
2416 }
2417}
2418
2419/* assume spinlock is already held */
2420static void skd_release_skreq(struct skd_device *skdev,
2421 struct skd_request_context *skreq)
2422{
2423 u32 msg_slot;
2424 struct skd_fitmsg_context *skmsg;
2425
2426 u32 timo_slot;
2427
2428 /*
2429 * Reclaim the FIT msg buffer if this is
2430 * the first of the requests it carried to
2431 * be completed. The FIT msg buffer used to
2432 * send this request cannot be reused until
2433 * we are sure the s1120 card has copied
2434 * it to its memory. The FIT msg might have
2435 * contained several requests. As soon as
2436 * any of them are completed we know that
2437 * the entire FIT msg was transferred.
2438 * Only the first completed request will
2439 * match the FIT msg buffer id. The FIT
2440 * msg buffer id is immediately updated.
2441 * When subsequent requests complete the FIT
2442 * msg buffer id won't match, so we know
2443 * quite cheaply that it is already done.
2444 */
2445 msg_slot = skreq->fitmsg_id & SKD_ID_SLOT_MASK;
2446 SKD_ASSERT(msg_slot < skdev->num_fitmsg_context);
2447
2448 skmsg = &skdev->skmsg_table[msg_slot];
2449 if (skmsg->id == skreq->fitmsg_id) {
2450 SKD_ASSERT(skmsg->state == SKD_MSG_STATE_BUSY);
2451 SKD_ASSERT(skmsg->outstanding > 0);
2452 skmsg->outstanding--;
2453 if (skmsg->outstanding == 0) {
2454 skmsg->state = SKD_MSG_STATE_IDLE;
2455 skmsg->id += SKD_ID_INCR;
2456 skmsg->next = skdev->skmsg_free_list;
2457 skdev->skmsg_free_list = skmsg;
2458 }
2459 }
2460
2461 /*
2462 * Decrease the number of active requests.
2463 * Also decrements the count in the timeout slot.
2464 */
2465 SKD_ASSERT(skdev->in_flight > 0);
2466 skdev->in_flight -= 1;
2467
2468 timo_slot = skreq->timeout_stamp & SKD_TIMEOUT_SLOT_MASK;
2469 SKD_ASSERT(skdev->timeout_slot[timo_slot] > 0);
2470 skdev->timeout_slot[timo_slot] -= 1;
2471
2472 /*
2473 * Reset backpointer
2474 */
2475 skreq->req = NULL;
2476
2477 /*
2478 * Reclaim the skd_request_context
2479 */
2480 skreq->state = SKD_REQ_STATE_IDLE;
2481 skreq->id += SKD_ID_INCR;
2482 skreq->next = skdev->skreq_free_list;
2483 skdev->skreq_free_list = skreq;
2484}
2485
2486#define DRIVER_INQ_EVPD_PAGE_CODE 0xDA
2487
2488static void skd_do_inq_page_00(struct skd_device *skdev,
2489 volatile struct fit_completion_entry_v1 *skcomp,
2490 volatile struct fit_comp_error_info *skerr,
2491 uint8_t *cdb, uint8_t *buf)
2492{
2493 uint16_t insert_pt, max_bytes, drive_pages, drive_bytes, new_size;
2494
2495 /* Caller requested "supported pages". The driver needs to insert
2496 * its page.
2497 */
2498 pr_debug("%s:%s:%d skd_do_driver_inquiry: modify supported pages.\n",
2499 skdev->name, __func__, __LINE__);
2500
2501 /* If the device rejected the request because the CDB was
2502 * improperly formed, then just leave.
2503 */
2504 if (skcomp->status == SAM_STAT_CHECK_CONDITION &&
2505 skerr->key == ILLEGAL_REQUEST && skerr->code == 0x24)
2506 return;
2507
2508 /* Get the amount of space the caller allocated */
2509 max_bytes = (cdb[3] << 8) | cdb[4];
2510
2511 /* Get the number of pages actually returned by the device */
2512 drive_pages = (buf[2] << 8) | buf[3];
2513 drive_bytes = drive_pages + 4;
2514 new_size = drive_pages + 1;
2515
2516 /* Supported pages must be in numerical order, so find where
2517 * the driver page needs to be inserted into the list of
2518 * pages returned by the device.
2519 */
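	/* (Example, assuming the device reported pages 0x00, 0x80 and 0x83:
	 *  all are below DRIVER_INQ_EVPD_PAGE_CODE (0xDA), so the loop runs
	 *  off the end of the list and the driver page is appended after
	 *  0x83, space permitting.) */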
2520 for (insert_pt = 4; insert_pt < drive_bytes; insert_pt++) {
2521 if (buf[insert_pt] == DRIVER_INQ_EVPD_PAGE_CODE)
2522 			return; /* Device is using this page code; abort. */
2523 else if (buf[insert_pt] > DRIVER_INQ_EVPD_PAGE_CODE)
2524 break;
2525 }
2526
2527 if (insert_pt < max_bytes) {
2528 uint16_t u;
2529
2530 /* Shift everything up one byte to make room. */
2531 for (u = new_size + 3; u > insert_pt; u--)
2532 buf[u] = buf[u - 1];
2533 buf[insert_pt] = DRIVER_INQ_EVPD_PAGE_CODE;
2534
2535 		/* Increment num_returned_bytes by 1 in SCSI (big-endian) byte order */
2536 		skcomp->num_returned_bytes =
2537 			be32_to_cpu(skcomp->num_returned_bytes) + 1;
2538 		skcomp->num_returned_bytes =
2539 			cpu_to_be32(skcomp->num_returned_bytes);
2540 }
2541
2542 /* update page length field to reflect the driver's page too */
2543 buf[2] = (uint8_t)((new_size >> 8) & 0xFF);
2544 buf[3] = (uint8_t)((new_size >> 0) & 0xFF);
2545}
2546
2547static void skd_get_link_info(struct pci_dev *pdev, u8 *speed, u8 *width)
2548{
2549 int pcie_reg;
2550 u16 pci_bus_speed;
2551 u8 pci_lanes;
2552
2553 pcie_reg = pci_find_capability(pdev, PCI_CAP_ID_EXP);
2554 if (pcie_reg) {
2555 u16 linksta;
2556 pci_read_config_word(pdev, pcie_reg + PCI_EXP_LNKSTA, &linksta);
2557
2558 pci_bus_speed = linksta & 0xF;
2559 pci_lanes = (linksta & 0x3F0) >> 4;
2560 } else {
2561 *speed = STEC_LINK_UNKNOWN;
2562 *width = 0xFF;
2563 return;
2564 }
2565
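	/* (Worked example of the decode above: linksta = 0x0042 gives a
	 *  speed field of 0x2, reported as 5 GT/s below, and a x4 link.) */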
2566 switch (pci_bus_speed) {
2567 case 1:
2568 *speed = STEC_LINK_2_5GTS;
2569 break;
2570 case 2:
2571 *speed = STEC_LINK_5GTS;
2572 break;
2573 case 3:
2574 *speed = STEC_LINK_8GTS;
2575 break;
2576 default:
2577 *speed = STEC_LINK_UNKNOWN;
2578 break;
2579 }
2580
2581 if (pci_lanes <= 0x20)
2582 *width = pci_lanes;
2583 else
2584 *width = 0xFF;
2585}
2586
2587static void skd_do_inq_page_da(struct skd_device *skdev,
2588 volatile struct fit_completion_entry_v1 *skcomp,
2589 volatile struct fit_comp_error_info *skerr,
2590 uint8_t *cdb, uint8_t *buf)
2591{
2592 struct pci_dev *pdev = skdev->pdev;
2593 unsigned max_bytes;
2594 struct driver_inquiry_data inq;
2595 u16 val;
2596
2597 pr_debug("%s:%s:%d skd_do_driver_inquiry: return driver page\n",
2598 skdev->name, __func__, __LINE__);
2599
2600 memset(&inq, 0, sizeof(inq));
2601
2602 inq.page_code = DRIVER_INQ_EVPD_PAGE_CODE;
2603
2604 skd_get_link_info(pdev, &inq.pcie_link_speed, &inq.pcie_link_lanes);
2605 inq.pcie_bus_number = cpu_to_be16(pdev->bus->number);
2606 inq.pcie_device_number = PCI_SLOT(pdev->devfn);
2607 inq.pcie_function_number = PCI_FUNC(pdev->devfn);
2608
2609 pci_read_config_word(pdev, PCI_VENDOR_ID, &val);
2610 inq.pcie_vendor_id = cpu_to_be16(val);
2611
2612 pci_read_config_word(pdev, PCI_DEVICE_ID, &val);
2613 inq.pcie_device_id = cpu_to_be16(val);
2614
2615 pci_read_config_word(pdev, PCI_SUBSYSTEM_VENDOR_ID, &val);
2616 inq.pcie_subsystem_vendor_id = cpu_to_be16(val);
2617
2618 pci_read_config_word(pdev, PCI_SUBSYSTEM_ID, &val);
2619 inq.pcie_subsystem_device_id = cpu_to_be16(val);
2620
2621 	/* Driver version, fixed length, padded with spaces on the right */
2622 inq.driver_version_length = sizeof(inq.driver_version);
2623 memset(&inq.driver_version, ' ', sizeof(inq.driver_version));
2624 memcpy(inq.driver_version, DRV_VER_COMPL,
2625 min(sizeof(inq.driver_version), strlen(DRV_VER_COMPL)));
2626
2627 inq.page_length = cpu_to_be16((sizeof(inq) - 4));
2628
2629 /* Clear the error set by the device */
2630 skcomp->status = SAM_STAT_GOOD;
2631 memset((void *)skerr, 0, sizeof(*skerr));
2632
2633 /* copy response into output buffer */
2634 max_bytes = (cdb[3] << 8) | cdb[4];
2635 memcpy(buf, &inq, min_t(unsigned, max_bytes, sizeof(inq)));
2636
2637 	skcomp->num_returned_bytes =
2638 		cpu_to_be32(min_t(uint16_t, max_bytes, sizeof(inq)));
2639}
2640
2641static void skd_do_driver_inq(struct skd_device *skdev,
2642 volatile struct fit_completion_entry_v1 *skcomp,
2643 volatile struct fit_comp_error_info *skerr,
2644 uint8_t *cdb, uint8_t *buf)
2645{
2646 if (!buf)
2647 return;
2648 else if (cdb[0] != INQUIRY)
2649 return; /* Not an INQUIRY */
2650 else if ((cdb[1] & 1) == 0)
2651 return; /* EVPD not set */
2652 else if (cdb[2] == 0)
2653 /* Need to add driver's page to supported pages list */
2654 skd_do_inq_page_00(skdev, skcomp, skerr, cdb, buf);
2655 else if (cdb[2] == DRIVER_INQ_EVPD_PAGE_CODE)
2656 /* Caller requested driver's page */
2657 skd_do_inq_page_da(skdev, skcomp, skerr, cdb, buf);
2658}
2659
2660static unsigned char *skd_sg_1st_page_ptr(struct scatterlist *sg)
2661{
2662 if (!sg)
2663 return NULL;
2664 if (!sg_page(sg))
2665 return NULL;
2666 return sg_virt(sg);
2667}
2668
2669static void skd_process_scsi_inq(struct skd_device *skdev,
2670 volatile struct fit_completion_entry_v1
2671 *skcomp,
2672 volatile struct fit_comp_error_info *skerr,
2673 struct skd_special_context *skspcl)
2674{
2675 uint8_t *buf;
2676 struct fit_msg_hdr *fmh = (struct fit_msg_hdr *)skspcl->msg_buf;
2677 struct skd_scsi_request *scsi_req = (struct skd_scsi_request *)&fmh[1];
2678
2679 dma_sync_sg_for_cpu(skdev->class_dev, skspcl->req.sg, skspcl->req.n_sg,
2680 skspcl->req.sg_data_dir);
2681 buf = skd_sg_1st_page_ptr(skspcl->req.sg);
2682
2683 if (buf)
2684 skd_do_driver_inq(skdev, skcomp, skerr, scsi_req->cdb, buf);
2685}
2686
2687
2688static int skd_isr_completion_posted(struct skd_device *skdev,
2689 int limit, int *enqueued)
2690{
2691 volatile struct fit_completion_entry_v1 *skcmp = NULL;
2692 volatile struct fit_comp_error_info *skerr;
2693 u16 req_id;
2694 u32 req_slot;
2695 struct skd_request_context *skreq;
2696 u16 cmp_cntxt = 0;
2697 u8 cmp_status = 0;
2698 u8 cmp_cycle = 0;
2699 u32 cmp_bytes = 0;
2700 int rc = 0;
2701 int processed = 0;
2702
2703 	for (;;) {
2704 SKD_ASSERT(skdev->skcomp_ix < SKD_N_COMPLETION_ENTRY);
2705
2706 skcmp = &skdev->skcomp_table[skdev->skcomp_ix];
2707 cmp_cycle = skcmp->cycle;
2708 cmp_cntxt = skcmp->tag;
2709 cmp_status = skcmp->status;
2710 cmp_bytes = be32_to_cpu(skcmp->num_returned_bytes);
2711
2712 skerr = &skdev->skerr_table[skdev->skcomp_ix];
2713
2714 pr_debug("%s:%s:%d "
2715 "cycle=%d ix=%d got cycle=%d cmdctxt=0x%x stat=%d "
2716 "busy=%d rbytes=0x%x proto=%d\n",
2717 skdev->name, __func__, __LINE__, skdev->skcomp_cycle,
2718 skdev->skcomp_ix, cmp_cycle, cmp_cntxt, cmp_status,
2719 skdev->in_flight, cmp_bytes, skdev->proto_ver);
2720
2721 if (cmp_cycle != skdev->skcomp_cycle) {
2722 pr_debug("%s:%s:%d end of completions\n",
2723 skdev->name, __func__, __LINE__);
2724 break;
2725 }
2726 /*
2727 * Update the completion queue head index and possibly
2728 * the completion cycle count. 8-bit wrap-around.
2729 */
2730 skdev->skcomp_ix++;
2731 if (skdev->skcomp_ix >= SKD_N_COMPLETION_ENTRY) {
2732 skdev->skcomp_ix = 0;
2733 skdev->skcomp_cycle++;
2734 }
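		/* (Illustrative: the index walks 0..SKD_N_COMPLETION_ENTRY-1
		 *  and skcomp_cycle increments on each wrap; the cycle
		 *  comparison above is what detects where valid entries end.) */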
2735
2736 /*
2737 * The command context is a unique 32-bit ID. The low order
2738 * bits help locate the request. The request is usually a
2739 * r/w request (see skd_start() above) or a special request.
2740 */
2741 req_id = cmp_cntxt;
2742 req_slot = req_id & SKD_ID_SLOT_AND_TABLE_MASK;
2743
2744 /* Is this other than a r/w request? */
2745 if (req_slot >= skdev->num_req_context) {
2746 /*
2747 * This is not a completion for a r/w request.
2748 */
2749 skd_complete_other(skdev, skcmp, skerr);
2750 continue;
2751 }
2752
2753 skreq = &skdev->skreq_table[req_slot];
2754
2755 /*
2756 * Make sure the request ID for the slot matches.
2757 */
2758 if (skreq->id != req_id) {
2759 pr_debug("%s:%s:%d mismatch comp_id=0x%x req_id=0x%x\n",
2760 skdev->name, __func__, __LINE__,
2761 req_id, skreq->id);
2762 {
2763 u16 new_id = cmp_cntxt;
2764 pr_err("(%s): Completion mismatch "
2765 "comp_id=0x%04x skreq=0x%04x new=0x%04x\n",
2766 skd_name(skdev), req_id,
2767 skreq->id, new_id);
2768
2769 continue;
2770 }
2771 }
2772
2773 SKD_ASSERT(skreq->state == SKD_REQ_STATE_BUSY);
2774
2775 if (skreq->state == SKD_REQ_STATE_ABORTED) {
2776 pr_debug("%s:%s:%d reclaim req %p id=%04x\n",
2777 skdev->name, __func__, __LINE__,
2778 skreq, skreq->id);
2779 /* a previously timed out command can
2780 * now be cleaned up */
2781 skd_release_skreq(skdev, skreq);
2782 continue;
2783 }
2784
2785 skreq->completion = *skcmp;
2786 if (unlikely(cmp_status == SAM_STAT_CHECK_CONDITION)) {
2787 skreq->err_info = *skerr;
2788 skd_log_check_status(skdev, cmp_status, skerr->key,
2789 skerr->code, skerr->qual,
2790 skerr->fruc);
2791 }
2792 /* Release DMA resources for the request. */
2793 if (skreq->n_sg > 0)
2794 skd_postop_sg_list(skdev, skreq);
2795
2796 if (!skreq->req) {
2797 pr_debug("%s:%s:%d NULL backptr skdreq %p, "
2798 "req=0x%x req_id=0x%x\n",
2799 skdev->name, __func__, __LINE__,
2800 skreq, skreq->id, req_id);
2801 } else {
2802 /*
2803 * Capture the outcome and post it back to the
2804 * native request.
2805 */
2806 if (likely(cmp_status == SAM_STAT_GOOD))
2807 skd_end_request(skdev, skreq, 0);
2808 else
2809 skd_resolve_req_exception(skdev, skreq);
2810 }
2811
2812 /*
2813 * Release the skreq, its FIT msg (if one), timeout slot,
2814 * and queue depth.
2815 */
2816 skd_release_skreq(skdev, skreq);
2817
2818 /* skd_isr_comp_limit equal zero means no limit */
2819 if (limit) {
2820 if (++processed >= limit) {
2821 rc = 1;
2822 break;
2823 }
2824 }
2825 }
2826
2827 if ((skdev->state == SKD_DRVR_STATE_PAUSING)
2828 && (skdev->in_flight) == 0) {
2829 skdev->state = SKD_DRVR_STATE_PAUSED;
2830 wake_up_interruptible(&skdev->waitq);
2831 }
2832
2833 return rc;
2834}
2835
2836static void skd_complete_other(struct skd_device *skdev,
2837 volatile struct fit_completion_entry_v1 *skcomp,
2838 volatile struct fit_comp_error_info *skerr)
2839{
2840 u32 req_id = 0;
2841 u32 req_table;
2842 u32 req_slot;
2843 struct skd_special_context *skspcl;
2844
2845 req_id = skcomp->tag;
2846 req_table = req_id & SKD_ID_TABLE_MASK;
2847 req_slot = req_id & SKD_ID_SLOT_MASK;
2848
2849 pr_debug("%s:%s:%d table=0x%x id=0x%x slot=%d\n",
2850 skdev->name, __func__, __LINE__,
2851 req_table, req_id, req_slot);
2852
2853 /*
2854 * Based on the request id, determine how to dispatch this completion.
2855 	 * This switch/case finds the good cases and forwards the
2856 * completion entry. Errors are reported below the switch.
2857 */
2858 switch (req_table) {
2859 case SKD_ID_RW_REQUEST:
2860 /*
2861 		 * The caller, skd_isr_completion_posted() above,
2862 * handles r/w requests. The only way we get here
2863 * is if the req_slot is out of bounds.
2864 */
2865 break;
2866
2867 case SKD_ID_SPECIAL_REQUEST:
2868 /*
2869 * Make sure the req_slot is in bounds and that the id
2870 * matches.
2871 */
2872 if (req_slot < skdev->n_special) {
2873 skspcl = &skdev->skspcl_table[req_slot];
2874 if (skspcl->req.id == req_id &&
2875 skspcl->req.state == SKD_REQ_STATE_BUSY) {
2876 skd_complete_special(skdev,
2877 skcomp, skerr, skspcl);
2878 return;
2879 }
2880 }
2881 break;
2882
2883 case SKD_ID_INTERNAL:
2884 if (req_slot == 0) {
2885 skspcl = &skdev->internal_skspcl;
2886 if (skspcl->req.id == req_id &&
2887 skspcl->req.state == SKD_REQ_STATE_BUSY) {
2888 skd_complete_internal(skdev,
2889 skcomp, skerr, skspcl);
2890 return;
2891 }
2892 }
2893 break;
2894
2895 case SKD_ID_FIT_MSG:
2896 /*
2897 		 * These IDs should never appear in a completion record.
2898 */
2899 break;
2900
2901 default:
2902 /*
2903 		 * These IDs should never appear anywhere.
2904 */
2905 break;
2906 }
2907
2908 /*
2909 * If we get here it is a bad or stale id.
2910 */
2911}
2912
2913static void skd_complete_special(struct skd_device *skdev,
2914 volatile struct fit_completion_entry_v1
2915 *skcomp,
2916 volatile struct fit_comp_error_info *skerr,
2917 struct skd_special_context *skspcl)
2918{
2919 pr_debug("%s:%s:%d completing special request %p\n",
2920 skdev->name, __func__, __LINE__, skspcl);
2921 if (skspcl->orphaned) {
2922 /* Discard orphaned request */
2923 /* ?: Can this release directly or does it need
2924 * to use a worker? */
2925 pr_debug("%s:%s:%d release orphaned %p\n",
2926 skdev->name, __func__, __LINE__, skspcl);
2927 skd_release_special(skdev, skspcl);
2928 return;
2929 }
2930
2931 skd_process_scsi_inq(skdev, skcomp, skerr, skspcl);
2932
2933 skspcl->req.state = SKD_REQ_STATE_COMPLETED;
2934 skspcl->req.completion = *skcomp;
2935 skspcl->req.err_info = *skerr;
2936
2937 skd_log_check_status(skdev, skspcl->req.completion.status, skerr->key,
2938 skerr->code, skerr->qual, skerr->fruc);
2939
2940 wake_up_interruptible(&skdev->waitq);
2941}
2942
2943/* assume spinlock is already held */
2944static void skd_release_special(struct skd_device *skdev,
2945 struct skd_special_context *skspcl)
2946{
2947 int i, was_depleted;
2948
2949 for (i = 0; i < skspcl->req.n_sg; i++) {
2950 struct page *page = sg_page(&skspcl->req.sg[i]);
2951 __free_page(page);
2952 }
2953
2954 was_depleted = (skdev->skspcl_free_list == NULL);
2955
2956 skspcl->req.state = SKD_REQ_STATE_IDLE;
2957 skspcl->req.id += SKD_ID_INCR;
2958 skspcl->req.next =
2959 (struct skd_request_context *)skdev->skspcl_free_list;
2960 skdev->skspcl_free_list = (struct skd_special_context *)skspcl;
2961
2962 if (was_depleted) {
2963 pr_debug("%s:%s:%d skspcl was depleted\n",
2964 skdev->name, __func__, __LINE__);
2965 		/* Free list was depleted. There might be waiters. */
2966 wake_up_interruptible(&skdev->waitq);
2967 }
2968}
2969
2970static void skd_reset_skcomp(struct skd_device *skdev)
2971{
2972 u32 nbytes;
2973 struct fit_completion_entry_v1 *skcomp;
2974
2975 nbytes = sizeof(*skcomp) * SKD_N_COMPLETION_ENTRY;
2976 nbytes += sizeof(struct fit_comp_error_info) * SKD_N_COMPLETION_ENTRY;
2977
2978 memset(skdev->skcomp_table, 0, nbytes);
2979
2980 skdev->skcomp_ix = 0;
2981 skdev->skcomp_cycle = 1;
2982}
2983
2984/*
2985 *****************************************************************************
2986 * INTERRUPTS
2987 *****************************************************************************
2988 */
2989static void skd_completion_worker(struct work_struct *work)
2990{
2991 struct skd_device *skdev =
2992 container_of(work, struct skd_device, completion_worker);
2993 unsigned long flags;
2994 int flush_enqueued = 0;
2995
2996 spin_lock_irqsave(&skdev->lock, flags);
2997
2998 /*
2999 	 * Pass in limit=0, which means no limit:
3000 	 * process everything in the completion queue.
3001 */
3002 skd_isr_completion_posted(skdev, 0, &flush_enqueued);
3003 skd_request_fn(skdev->queue);
3004
3005 spin_unlock_irqrestore(&skdev->lock, flags);
3006}
3007
3008static void skd_isr_msg_from_dev(struct skd_device *skdev);
3009
3010 static irqreturn_t
3011 skd_isr(int irq, void *ptr)
3012{
3013 struct skd_device *skdev;
3014 u32 intstat;
3015 u32 ack;
3016 int rc = 0;
3017 int deferred = 0;
3018 int flush_enqueued = 0;
3019
3020 skdev = (struct skd_device *)ptr;
3021 spin_lock(&skdev->lock);
3022
3023 	for (;;) {
3024 intstat = SKD_READL(skdev, FIT_INT_STATUS_HOST);
3025
3026 ack = FIT_INT_DEF_MASK;
3027 ack &= intstat;
3028
3029 pr_debug("%s:%s:%d intstat=0x%x ack=0x%x\n",
3030 skdev->name, __func__, __LINE__, intstat, ack);
3031
3032 		/* As long as there is an int pending on the device, keep
3033 		 * running the loop. When none remain, get out, but if we've
3034 		 * never done any processing, call the completion handler?
3035 */
3036 if (ack == 0) {
3037 /* No interrupts on device, but run the completion
3038 * processor anyway?
3039 */
3040 if (rc == 0)
3041 if (likely (skdev->state
3042 == SKD_DRVR_STATE_ONLINE))
3043 deferred = 1;
3044 break;
3045 }
3046
3047 rc = IRQ_HANDLED;
3048
3049 SKD_WRITEL(skdev, ack, FIT_INT_STATUS_HOST);
3050
3051 if (likely((skdev->state != SKD_DRVR_STATE_LOAD) &&
3052 (skdev->state != SKD_DRVR_STATE_STOPPING))) {
3053 if (intstat & FIT_ISH_COMPLETION_POSTED) {
3054 /*
3055 * If we have already deferred completion
3056 * processing, don't bother running it again
3057 */
3058 if (deferred == 0)
3059 deferred =
3060 skd_isr_completion_posted(skdev,
3061 skd_isr_comp_limit, &flush_enqueued);
3062 }
3063
3064 if (intstat & FIT_ISH_FW_STATE_CHANGE) {
3065 skd_isr_fwstate(skdev);
3066 if (skdev->state == SKD_DRVR_STATE_FAULT ||
3067 skdev->state ==
3068 SKD_DRVR_STATE_DISAPPEARED) {
3069 spin_unlock(&skdev->lock);
3070 return rc;
3071 }
3072 }
3073
3074 if (intstat & FIT_ISH_MSG_FROM_DEV)
3075 skd_isr_msg_from_dev(skdev);
3076 }
3077 }
3078
3079 if (unlikely(flush_enqueued))
3080 skd_request_fn(skdev->queue);
3081
3082 if (deferred)
3083 schedule_work(&skdev->completion_worker);
3084 else if (!flush_enqueued)
3085 skd_request_fn(skdev->queue);
3086
3087 spin_unlock(&skdev->lock);
3088
3089 return rc;
3090}
3091
3092static void skd_drive_fault(struct skd_device *skdev)
3093{
3094 skdev->state = SKD_DRVR_STATE_FAULT;
3095 pr_err("(%s): Drive FAULT\n", skd_name(skdev));
3096}
3097
3098static void skd_drive_disappeared(struct skd_device *skdev)
3099{
3100 skdev->state = SKD_DRVR_STATE_DISAPPEARED;
3101 pr_err("(%s): Drive DISAPPEARED\n", skd_name(skdev));
3102}
3103
3104static void skd_isr_fwstate(struct skd_device *skdev)
3105{
3106 u32 sense;
3107 u32 state;
3108 u32 mtd;
3109 int prev_driver_state = skdev->state;
3110
3111 sense = SKD_READL(skdev, FIT_STATUS);
3112 state = sense & FIT_SR_DRIVE_STATE_MASK;
3113
3114 pr_err("(%s): s1120 state %s(%d)=>%s(%d)\n",
3115 skd_name(skdev),
3116 skd_drive_state_to_str(skdev->drive_state), skdev->drive_state,
3117 skd_drive_state_to_str(state), state);
3118
3119 skdev->drive_state = state;
3120
3121 switch (skdev->drive_state) {
3122 case FIT_SR_DRIVE_INIT:
3123 if (skdev->state == SKD_DRVR_STATE_PROTOCOL_MISMATCH) {
3124 skd_disable_interrupts(skdev);
3125 break;
3126 }
3127 if (skdev->state == SKD_DRVR_STATE_RESTARTING)
3128 skd_recover_requests(skdev, 0);
3129 if (skdev->state == SKD_DRVR_STATE_WAIT_BOOT) {
3130 skdev->timer_countdown = SKD_STARTING_TIMO;
3131 skdev->state = SKD_DRVR_STATE_STARTING;
3132 skd_soft_reset(skdev);
3133 break;
3134 }
3135 mtd = FIT_MXD_CONS(FIT_MTD_FITFW_INIT, 0, 0);
3136 SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
3137 skdev->last_mtd = mtd;
3138 break;
3139
3140 case FIT_SR_DRIVE_ONLINE:
3141 skdev->cur_max_queue_depth = skd_max_queue_depth;
3142 if (skdev->cur_max_queue_depth > skdev->dev_max_queue_depth)
3143 skdev->cur_max_queue_depth = skdev->dev_max_queue_depth;
3144
3145 skdev->queue_low_water_mark =
3146 skdev->cur_max_queue_depth * 2 / 3 + 1;
3147 if (skdev->queue_low_water_mark < 1)
3148 skdev->queue_low_water_mark = 1;
3149 pr_info(
3150 "(%s): Queue depth limit=%d dev=%d lowat=%d\n",
3151 skd_name(skdev),
3152 skdev->cur_max_queue_depth,
3153 skdev->dev_max_queue_depth, skdev->queue_low_water_mark);
3154
3155 skd_refresh_device_data(skdev);
3156 break;
3157
3158 case FIT_SR_DRIVE_BUSY:
3159 skdev->state = SKD_DRVR_STATE_BUSY;
3160 skdev->timer_countdown = SKD_BUSY_TIMO;
3161 skd_quiesce_dev(skdev);
3162 break;
3163 case FIT_SR_DRIVE_BUSY_SANITIZE:
3164 /* set timer for 3 seconds, we'll abort any unfinished
3165 * commands after that expires
3166 */
3167 skdev->state = SKD_DRVR_STATE_BUSY_SANITIZE;
3168 skdev->timer_countdown = SKD_TIMER_SECONDS(3);
3169 blk_start_queue(skdev->queue);
3170 break;
3171 case FIT_SR_DRIVE_BUSY_ERASE:
3172 skdev->state = SKD_DRVR_STATE_BUSY_ERASE;
3173 skdev->timer_countdown = SKD_BUSY_TIMO;
3174 break;
3175 case FIT_SR_DRIVE_OFFLINE:
3176 skdev->state = SKD_DRVR_STATE_IDLE;
3177 break;
3178 case FIT_SR_DRIVE_SOFT_RESET:
3179 switch (skdev->state) {
3180 case SKD_DRVR_STATE_STARTING:
3181 case SKD_DRVR_STATE_RESTARTING:
3182 /* Expected by a caller of skd_soft_reset() */
3183 break;
3184 default:
3185 skdev->state = SKD_DRVR_STATE_RESTARTING;
3186 break;
3187 }
3188 break;
3189 case FIT_SR_DRIVE_FW_BOOTING:
3190 pr_debug("%s:%s:%d ISR FIT_SR_DRIVE_FW_BOOTING %s\n",
3191 skdev->name, __func__, __LINE__, skdev->name);
3192 skdev->state = SKD_DRVR_STATE_WAIT_BOOT;
3193 skdev->timer_countdown = SKD_WAIT_BOOT_TIMO;
3194 break;
3195
3196 case FIT_SR_DRIVE_DEGRADED:
3197 case FIT_SR_PCIE_LINK_DOWN:
3198 case FIT_SR_DRIVE_NEED_FW_DOWNLOAD:
3199 break;
3200
3201 case FIT_SR_DRIVE_FAULT:
3202 skd_drive_fault(skdev);
3203 skd_recover_requests(skdev, 0);
3204 blk_start_queue(skdev->queue);
3205 break;
3206
3207 /* PCIe bus returned all Fs? */
3208 case 0xFF:
3209 pr_info("(%s): state=0x%x sense=0x%x\n",
3210 skd_name(skdev), state, sense);
3211 skd_drive_disappeared(skdev);
3212 skd_recover_requests(skdev, 0);
3213 blk_start_queue(skdev->queue);
3214 break;
3215 default:
3216 /*
3217 		 * Unknown FW state. Wait for a state we recognize.
3218 */
3219 break;
3220 }
3221 pr_err("(%s): Driver state %s(%d)=>%s(%d)\n",
3222 skd_name(skdev),
3223 skd_skdev_state_to_str(prev_driver_state), prev_driver_state,
3224 skd_skdev_state_to_str(skdev->state), skdev->state);
3225}
3226
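/*
 * Overview of skd_recover_requests() below (added for clarity): every busy
 * r/w request is either requeued (when requeue is set and the retry budget
 * allows) or failed with -EIO, busy FIT msg contexts are salvaged, special
 * requests are released or marked aborted, and the free lists, timeout
 * slots and in-flight count are rebuilt from scratch.
 */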
3227static void skd_recover_requests(struct skd_device *skdev, int requeue)
3228{
3229 int i;
3230
3231 for (i = 0; i < skdev->num_req_context; i++) {
3232 struct skd_request_context *skreq = &skdev->skreq_table[i];
3233
3234 if (skreq->state == SKD_REQ_STATE_BUSY) {
3235 skd_log_skreq(skdev, skreq, "recover");
3236
3237 SKD_ASSERT((skreq->id & SKD_ID_INCR) != 0);
3238 SKD_ASSERT(skreq->req != NULL);
3239
3240 /* Release DMA resources for the request. */
3241 if (skreq->n_sg > 0)
3242 skd_postop_sg_list(skdev, skreq);
3243
3244 if (requeue &&
3245 (unsigned long) ++skreq->req->special <
3246 SKD_MAX_RETRIES)
3247 blk_requeue_request(skdev->queue, skreq->req);
3248 else
3249 skd_end_request(skdev, skreq, -EIO);
3250
3251 skreq->req = NULL;
3252
3253 skreq->state = SKD_REQ_STATE_IDLE;
3254 skreq->id += SKD_ID_INCR;
3255 }
3256 if (i > 0)
3257 skreq[-1].next = skreq;
3258 skreq->next = NULL;
3259 }
3260 skdev->skreq_free_list = skdev->skreq_table;
3261
3262 	qcmd |= FIT_QCMD_QID_NORMAL | FIT_QCMD_MSGSIZE_128;
3263 struct skd_fitmsg_context *skmsg = &skdev->skmsg_table[i];
3264
3265 if (skmsg->state == SKD_MSG_STATE_BUSY) {
3266 skd_log_skmsg(skdev, skmsg, "salvaged");
3267 SKD_ASSERT((skmsg->id & SKD_ID_INCR) != 0);
3268 skmsg->state = SKD_MSG_STATE_IDLE;
3269 skmsg->id += SKD_ID_INCR;
3270 }
3271 if (i > 0)
3272 skmsg[-1].next = skmsg;
3273 skmsg->next = NULL;
3274 }
3275 skdev->skmsg_free_list = skdev->skmsg_table;
3276
3277 for (i = 0; i < skdev->n_special; i++) {
3278 struct skd_special_context *skspcl = &skdev->skspcl_table[i];
3279
3280 		/* If orphaned, reclaim it because it has already been reported
3281 		 * to the process as an error (it was just waiting for
3282 		 * a completion that didn't come, and now it never will).
3283 		 * If busy, change to a state that will cause it to error
3284 		 * out in the wait routine and let it do the normal
3285 		 * reporting and reclaiming.
3286 		 */
3287 if (skspcl->req.state == SKD_REQ_STATE_BUSY) {
3288 if (skspcl->orphaned) {
3289 pr_debug("%s:%s:%d orphaned %p\n",
3290 skdev->name, __func__, __LINE__,
3291 skspcl);
3292 skd_release_special(skdev, skspcl);
3293 } else {
3294 pr_debug("%s:%s:%d not orphaned %p\n",
3295 skdev->name, __func__, __LINE__,
3296 skspcl);
3297 skspcl->req.state = SKD_REQ_STATE_ABORTED;
3298 }
3299 }
3300 }
3301 skdev->skspcl_free_list = skdev->skspcl_table;
3302
3303 for (i = 0; i < SKD_N_TIMEOUT_SLOT; i++)
3304 skdev->timeout_slot[i] = 0;
3305
3306 skdev->in_flight = 0;
3307}
3308
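/*
 * Overview of the init handshake driven by skd_isr_msg_from_dev() below
 * (added for clarity): FITFW_INIT -> GET_CMDQ_DEPTH -> SET_COMPQ_DEPTH ->
 * SET_COMPQ_ADDR -> CMD_LOG_HOST_ID -> CMD_LOG_TIME_STAMP_LO/HI ->
 * ARM_QUEUE, with each acknowledgement from the device triggering the next
 * message to it.
 */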
3309static void skd_isr_msg_from_dev(struct skd_device *skdev)
3310{
3311 u32 mfd;
3312 u32 mtd;
3313 u32 data;
3314
3315 mfd = SKD_READL(skdev, FIT_MSG_FROM_DEVICE);
3316
3317 pr_debug("%s:%s:%d mfd=0x%x last_mtd=0x%x\n",
3318 skdev->name, __func__, __LINE__, mfd, skdev->last_mtd);
3319
3320 /* ignore any mtd that is an ack for something we didn't send */
3321 if (FIT_MXD_TYPE(mfd) != FIT_MXD_TYPE(skdev->last_mtd))
3322 return;
3323
3324 switch (FIT_MXD_TYPE(mfd)) {
3325 case FIT_MTD_FITFW_INIT:
3326 skdev->proto_ver = FIT_PROTOCOL_MAJOR_VER(mfd);
3327
3328 if (skdev->proto_ver != FIT_PROTOCOL_VERSION_1) {
3329 pr_err("(%s): protocol mismatch\n",
3330 skdev->name);
3331 pr_err("(%s): got=%d support=%d\n",
3332 skdev->name, skdev->proto_ver,
3333 FIT_PROTOCOL_VERSION_1);
3334 pr_err("(%s): please upgrade driver\n",
3335 skdev->name);
3336 skdev->state = SKD_DRVR_STATE_PROTOCOL_MISMATCH;
3337 skd_soft_reset(skdev);
3338 break;
3339 }
3340 mtd = FIT_MXD_CONS(FIT_MTD_GET_CMDQ_DEPTH, 0, 0);
3341 SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
3342 skdev->last_mtd = mtd;
3343 break;
3344
3345 case FIT_MTD_GET_CMDQ_DEPTH:
3346 skdev->dev_max_queue_depth = FIT_MXD_DATA(mfd);
3347 mtd = FIT_MXD_CONS(FIT_MTD_SET_COMPQ_DEPTH, 0,
3348 SKD_N_COMPLETION_ENTRY);
3349 SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
3350 skdev->last_mtd = mtd;
3351 break;
3352
3353 case FIT_MTD_SET_COMPQ_DEPTH:
3354 SKD_WRITEQ(skdev, skdev->cq_dma_address, FIT_MSG_TO_DEVICE_ARG);
3355 mtd = FIT_MXD_CONS(FIT_MTD_SET_COMPQ_ADDR, 0, 0);
3356 SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
3357 skdev->last_mtd = mtd;
3358 break;
3359
3360 case FIT_MTD_SET_COMPQ_ADDR:
3361 skd_reset_skcomp(skdev);
3362 mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_HOST_ID, 0, skdev->devno);
3363 SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
3364 skdev->last_mtd = mtd;
3365 break;
3366
3367 case FIT_MTD_CMD_LOG_HOST_ID:
3368 skdev->connect_time_stamp = get_seconds();
3369 data = skdev->connect_time_stamp & 0xFFFF;
3370 mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_LO, 0, data);
3371 SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
3372 skdev->last_mtd = mtd;
3373 break;
3374
3375 case FIT_MTD_CMD_LOG_TIME_STAMP_LO:
3376 skdev->drive_jiffies = FIT_MXD_DATA(mfd);
3377 data = (skdev->connect_time_stamp >> 16) & 0xFFFF;
3378 mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_HI, 0, data);
3379 SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
3380 skdev->last_mtd = mtd;
3381 break;
3382
3383 case FIT_MTD_CMD_LOG_TIME_STAMP_HI:
3384 skdev->drive_jiffies |= (FIT_MXD_DATA(mfd) << 16);
3385 mtd = FIT_MXD_CONS(FIT_MTD_ARM_QUEUE, 0, 0);
3386 SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
3387 skdev->last_mtd = mtd;
3388
3389 pr_err("(%s): Time sync driver=0x%x device=0x%x\n",
3390 skd_name(skdev),
3391 skdev->connect_time_stamp, skdev->drive_jiffies);
3392 break;
3393
3394 case FIT_MTD_ARM_QUEUE:
3395 skdev->last_mtd = 0;
3396 /*
3397 * State should be, or soon will be, FIT_SR_DRIVE_ONLINE.
3398 */
3399 break;
3400
3401 default:
3402 break;
3403 }
3404}
3405
3406static void skd_disable_interrupts(struct skd_device *skdev)
3407{
3408 u32 sense;
3409
3410 sense = SKD_READL(skdev, FIT_CONTROL);
3411 sense &= ~FIT_CR_ENABLE_INTERRUPTS;
3412 SKD_WRITEL(skdev, sense, FIT_CONTROL);
3413 pr_debug("%s:%s:%d sense 0x%x\n",
3414 skdev->name, __func__, __LINE__, sense);
3415
3416 	/* Note that all 1s are written. A 1-bit means
3417 * disable, a 0 means enable.
3418 */
3419 SKD_WRITEL(skdev, ~0, FIT_INT_MASK_HOST);
3420}
3421
3422static void skd_enable_interrupts(struct skd_device *skdev)
3423{
3424 u32 val;
3425
3426 /* unmask interrupts first */
3427 val = FIT_ISH_FW_STATE_CHANGE +
3428 FIT_ISH_COMPLETION_POSTED + FIT_ISH_MSG_FROM_DEV;
3429
3430 	/* Note that the complement of the mask is written. A 1-bit means
3431 * disable, a 0 means enable. */
3432 SKD_WRITEL(skdev, ~val, FIT_INT_MASK_HOST);
3433 pr_debug("%s:%s:%d interrupt mask=0x%x\n",
3434 skdev->name, __func__, __LINE__, ~val);
3435
3436 val = SKD_READL(skdev, FIT_CONTROL);
3437 val |= FIT_CR_ENABLE_INTERRUPTS;
3438 pr_debug("%s:%s:%d control=0x%x\n",
3439 skdev->name, __func__, __LINE__, val);
3440 SKD_WRITEL(skdev, val, FIT_CONTROL);
3441}
3442
3443/*
3444 *****************************************************************************
3445 * START, STOP, RESTART, QUIESCE, UNQUIESCE
3446 *****************************************************************************
3447 */
3448
3449static void skd_soft_reset(struct skd_device *skdev)
3450{
3451 u32 val;
3452
3453 val = SKD_READL(skdev, FIT_CONTROL);
3454 val |= (FIT_CR_SOFT_RESET);
3455 pr_debug("%s:%s:%d control=0x%x\n",
3456 skdev->name, __func__, __LINE__, val);
3457 SKD_WRITEL(skdev, val, FIT_CONTROL);
3458}
3459
3460static void skd_start_device(struct skd_device *skdev)
3461{
3462 unsigned long flags;
3463 u32 sense;
3464 u32 state;
3465
3466 spin_lock_irqsave(&skdev->lock, flags);
3467
3468 /* ack all ghost interrupts */
3469 SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST);
3470
3471 sense = SKD_READL(skdev, FIT_STATUS);
3472
3473 pr_debug("%s:%s:%d initial status=0x%x\n",
3474 skdev->name, __func__, __LINE__, sense);
3475
3476 state = sense & FIT_SR_DRIVE_STATE_MASK;
3477 skdev->drive_state = state;
3478 skdev->last_mtd = 0;
3479
3480 skdev->state = SKD_DRVR_STATE_STARTING;
3481 skdev->timer_countdown = SKD_STARTING_TIMO;
3482
3483 skd_enable_interrupts(skdev);
3484
3485 switch (skdev->drive_state) {
3486 case FIT_SR_DRIVE_OFFLINE:
3487 pr_err("(%s): Drive offline...\n", skd_name(skdev));
3488 break;
3489
3490 case FIT_SR_DRIVE_FW_BOOTING:
3491 pr_debug("%s:%s:%d FIT_SR_DRIVE_FW_BOOTING %s\n",
3492 skdev->name, __func__, __LINE__, skdev->name);
3493 skdev->state = SKD_DRVR_STATE_WAIT_BOOT;
3494 skdev->timer_countdown = SKD_WAIT_BOOT_TIMO;
3495 break;
3496
3497 case FIT_SR_DRIVE_BUSY_SANITIZE:
3498 pr_info("(%s): Start: BUSY_SANITIZE\n",
3499 skd_name(skdev));
3500 skdev->state = SKD_DRVR_STATE_BUSY_SANITIZE;
3501 skdev->timer_countdown = SKD_STARTED_BUSY_TIMO;
3502 break;
3503
3504 case FIT_SR_DRIVE_BUSY_ERASE:
3505 pr_info("(%s): Start: BUSY_ERASE\n", skd_name(skdev));
3506 skdev->state = SKD_DRVR_STATE_BUSY_ERASE;
3507 skdev->timer_countdown = SKD_STARTED_BUSY_TIMO;
3508 break;
3509
3510 case FIT_SR_DRIVE_INIT:
3511 case FIT_SR_DRIVE_ONLINE:
3512 skd_soft_reset(skdev);
3513 break;
3514
3515 case FIT_SR_DRIVE_BUSY:
3516 pr_err("(%s): Drive Busy...\n", skd_name(skdev));
3517 skdev->state = SKD_DRVR_STATE_BUSY;
3518 skdev->timer_countdown = SKD_STARTED_BUSY_TIMO;
3519 break;
3520
3521 case FIT_SR_DRIVE_SOFT_RESET:
3522 pr_err("(%s) drive soft reset in prog\n",
3523 skd_name(skdev));
3524 break;
3525
3526 case FIT_SR_DRIVE_FAULT:
3527 /* Fault state is bad...soft reset won't do it...
3528 * Hard reset, maybe, but does it work on device?
3529 * For now, just fault so the system doesn't hang.
3530 */
3531 skd_drive_fault(skdev);
3532 		/* start the queue so we can respond with error to requests */
3533 pr_debug("%s:%s:%d starting %s queue\n",
3534 skdev->name, __func__, __LINE__, skdev->name);
3535 blk_start_queue(skdev->queue);
3536 skdev->gendisk_on = -1;
3537 wake_up_interruptible(&skdev->waitq);
3538 break;
3539
3540 case 0xFF:
3541 /* Most likely the device isn't there or isn't responding
3542 * to the BAR1 addresses. */
3543 skd_drive_disappeared(skdev);
3544 		/* start the queue so we can respond with error to requests */
3545 pr_debug("%s:%s:%d starting %s queue to error-out reqs\n",
3546 skdev->name, __func__, __LINE__, skdev->name);
3547 blk_start_queue(skdev->queue);
3548 skdev->gendisk_on = -1;
3549 wake_up_interruptible(&skdev->waitq);
3550 break;
3551
3552 default:
3553 pr_err("(%s) Start: unknown state %x\n",
3554 skd_name(skdev), skdev->drive_state);
3555 break;
3556 }
3557
3558 state = SKD_READL(skdev, FIT_CONTROL);
3559 pr_debug("%s:%s:%d FIT Control Status=0x%x\n",
3560 skdev->name, __func__, __LINE__, state);
3561
3562 state = SKD_READL(skdev, FIT_INT_STATUS_HOST);
3563 pr_debug("%s:%s:%d Intr Status=0x%x\n",
3564 skdev->name, __func__, __LINE__, state);
3565
3566 state = SKD_READL(skdev, FIT_INT_MASK_HOST);
3567 pr_debug("%s:%s:%d Intr Mask=0x%x\n",
3568 skdev->name, __func__, __LINE__, state);
3569
3570 state = SKD_READL(skdev, FIT_MSG_FROM_DEVICE);
3571 pr_debug("%s:%s:%d Msg from Dev=0x%x\n",
3572 skdev->name, __func__, __LINE__, state);
3573
3574 state = SKD_READL(skdev, FIT_HW_VERSION);
3575 pr_debug("%s:%s:%d HW version=0x%x\n",
3576 skdev->name, __func__, __LINE__, state);
3577
3578 spin_unlock_irqrestore(&skdev->lock, flags);
3579}
3580
3581static void skd_stop_device(struct skd_device *skdev)
3582{
3583 unsigned long flags;
3584 struct skd_special_context *skspcl = &skdev->internal_skspcl;
3585 u32 dev_state;
3586 int i;
3587
3588 spin_lock_irqsave(&skdev->lock, flags);
3589
3590 if (skdev->state != SKD_DRVR_STATE_ONLINE) {
3591 pr_err("(%s): skd_stop_device not online no sync\n",
3592 skd_name(skdev));
3593 goto stop_out;
3594 }
3595
3596 if (skspcl->req.state != SKD_REQ_STATE_IDLE) {
3597 pr_err("(%s): skd_stop_device no special\n",
3598 skd_name(skdev));
3599 goto stop_out;
3600 }
3601
3602 skdev->state = SKD_DRVR_STATE_SYNCING;
3603 skdev->sync_done = 0;
3604
3605 skd_send_internal_skspcl(skdev, skspcl, SYNCHRONIZE_CACHE);
3606
3607 spin_unlock_irqrestore(&skdev->lock, flags);
3608
3609 wait_event_interruptible_timeout(skdev->waitq,
3610 (skdev->sync_done), (10 * HZ));
3611
3612 spin_lock_irqsave(&skdev->lock, flags);
3613
3614 switch (skdev->sync_done) {
3615 case 0:
3616 pr_err("(%s): skd_stop_device no sync\n",
3617 skd_name(skdev));
3618 break;
3619 case 1:
3620 pr_err("(%s): skd_stop_device sync done\n",
3621 skd_name(skdev));
3622 break;
3623 default:
3624 pr_err("(%s): skd_stop_device sync error\n",
3625 skd_name(skdev));
3626 }
3627
3628stop_out:
3629 skdev->state = SKD_DRVR_STATE_STOPPING;
3630 spin_unlock_irqrestore(&skdev->lock, flags);
3631
3632 skd_kill_timer(skdev);
3633
3634 spin_lock_irqsave(&skdev->lock, flags);
3635 skd_disable_interrupts(skdev);
3636
3637 /* ensure all ints on device are cleared */
3638 /* soft reset the device to unload with a clean slate */
3639 SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST);
3640 SKD_WRITEL(skdev, FIT_CR_SOFT_RESET, FIT_CONTROL);
3641
3642 spin_unlock_irqrestore(&skdev->lock, flags);
3643
3644 /* poll every 100ms, 1 second timeout */
3645 for (i = 0; i < 10; i++) {
3646 dev_state =
3647 SKD_READL(skdev, FIT_STATUS) & FIT_SR_DRIVE_STATE_MASK;
3648 if (dev_state == FIT_SR_DRIVE_INIT)
3649 break;
3650 set_current_state(TASK_INTERRUPTIBLE);
3651 schedule_timeout(msecs_to_jiffies(100));
3652 }
3653
3654 if (dev_state != FIT_SR_DRIVE_INIT)
3655 pr_err("(%s): skd_stop_device state error 0x%02x\n",
3656 skd_name(skdev), dev_state);
3657}
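
The tail of skd_stop_device() is a bounded poll: read the drive state every 100 ms and give up after roughly one second. Below is a standalone userspace sketch of the same pattern, for reference only; read_state() is a hypothetical stub and nanosleep() stands in for schedule_timeout():

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define DRIVE_INIT 0x01u	/* value the loop waits for, mirrors FIT_SR_DRIVE_INIT */

/* Hypothetical stand-in for SKD_READL(skdev, FIT_STATUS) & FIT_SR_DRIVE_STATE_MASK. */
static uint32_t read_state(void)
{
	static int calls;
	return (++calls < 4) ? 0x08u : DRIVE_INIT;	/* pretend the reset completes on the 4th poll */
}

int main(void)
{
	struct timespec delay = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };
	uint32_t state = 0;
	int i;

	/* Same shape as the driver loop: poll every 100 ms, up to 10 times (~1 s). */
	for (i = 0; i < 10; i++) {
		state = read_state();
		if (state == DRIVE_INIT)
			break;
		nanosleep(&delay, NULL);
	}

	if (state == DRIVE_INIT)
		printf("reset completed after %d polls\n", i + 1);
	else
		printf("timed out, last state 0x%02x\n", state);
	return 0;
}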
3658
3659/* assume spinlock is held */
3660static void skd_restart_device(struct skd_device *skdev)
3661{
3662 u32 state;
3663
3664 /* ack all ghost interrupts */
3665 SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST);
3666
3667 state = SKD_READL(skdev, FIT_STATUS);
3668
3669 pr_debug("%s:%s:%d drive status=0x%x\n",
3670 skdev->name, __func__, __LINE__, state);
3671
3672 state &= FIT_SR_DRIVE_STATE_MASK;
3673 skdev->drive_state = state;
3674 skdev->last_mtd = 0;
3675
3676 skdev->state = SKD_DRVR_STATE_RESTARTING;
3677 skdev->timer_countdown = SKD_RESTARTING_TIMO;
3678
3679 skd_soft_reset(skdev);
3680}
3681
3682/* assume spinlock is held */
3683static int skd_quiesce_dev(struct skd_device *skdev)
3684{
3685 int rc = 0;
3686
3687 switch (skdev->state) {
3688 case SKD_DRVR_STATE_BUSY:
3689 case SKD_DRVR_STATE_BUSY_IMMINENT:
3690 pr_debug("%s:%s:%d stopping %s queue\n",
3691 skdev->name, __func__, __LINE__, skdev->name);
3692 blk_stop_queue(skdev->queue);
3693 break;
3694 case SKD_DRVR_STATE_ONLINE:
3695 case SKD_DRVR_STATE_STOPPING:
3696 case SKD_DRVR_STATE_SYNCING:
3697 case SKD_DRVR_STATE_PAUSING:
3698 case SKD_DRVR_STATE_PAUSED:
3699 case SKD_DRVR_STATE_STARTING:
3700 case SKD_DRVR_STATE_RESTARTING:
3701 case SKD_DRVR_STATE_RESUMING:
3702 default:
3703 rc = -EINVAL;
3704 pr_debug("%s:%s:%d state [%d] not implemented\n",
3705 skdev->name, __func__, __LINE__, skdev->state);
3706 }
3707 return rc;
3708}
3709
3710/* assume spinlock is held */
3711static int skd_unquiesce_dev(struct skd_device *skdev)
3712{
3713 int prev_driver_state = skdev->state;
3714
3715 skd_log_skdev(skdev, "unquiesce");
3716 if (skdev->state == SKD_DRVR_STATE_ONLINE) {
3717 pr_debug("%s:%s:%d **** device already ONLINE\n",
3718 skdev->name, __func__, __LINE__);
3719 return 0;
3720 }
3721 if (skdev->drive_state != FIT_SR_DRIVE_ONLINE) {
3722 /*
3723		 * If there has been a state change to something other
3724		 * than ONLINE, we rely on a controller state change
3725		 * to bring the device back online and restart the queue.
3726		 * The BUSY state means the driver is ready to continue
3727		 * normal processing but is waiting for the controller
3728		 * to become available.
3729 */
3730 skdev->state = SKD_DRVR_STATE_BUSY;
3731 pr_debug("%s:%s:%d drive BUSY state\n",
3732 skdev->name, __func__, __LINE__);
3733 return 0;
3734 }
3735
3736 /*
3737	 * The drive has just come online; the driver is either in startup,
3738	 * paused performing a task, or busy waiting for hardware.
3739 */
3740 switch (skdev->state) {
3741 case SKD_DRVR_STATE_PAUSED:
3742 case SKD_DRVR_STATE_BUSY:
3743 case SKD_DRVR_STATE_BUSY_IMMINENT:
3744 case SKD_DRVR_STATE_BUSY_ERASE:
3745 case SKD_DRVR_STATE_STARTING:
3746 case SKD_DRVR_STATE_RESTARTING:
3747 case SKD_DRVR_STATE_FAULT:
3748 case SKD_DRVR_STATE_IDLE:
3749 case SKD_DRVR_STATE_LOAD:
3750 skdev->state = SKD_DRVR_STATE_ONLINE;
3751 pr_err("(%s): Driver state %s(%d)=>%s(%d)\n",
3752 skd_name(skdev),
3753 skd_skdev_state_to_str(prev_driver_state),
3754 prev_driver_state, skd_skdev_state_to_str(skdev->state),
3755 skdev->state);
3756 pr_debug("%s:%s:%d **** device ONLINE...starting block queue\n",
3757 skdev->name, __func__, __LINE__);
3758 pr_debug("%s:%s:%d starting %s queue\n",
3759 skdev->name, __func__, __LINE__, skdev->name);
3760 pr_info("(%s): STEC s1120 ONLINE\n", skd_name(skdev));
3761 blk_start_queue(skdev->queue);
3762 skdev->gendisk_on = 1;
3763 wake_up_interruptible(&skdev->waitq);
3764 break;
3765
3766 case SKD_DRVR_STATE_DISAPPEARED:
3767 default:
3768 pr_debug("%s:%s:%d **** driver state %d, not implemented \n",
3769 skdev->name, __func__, __LINE__,
3770 skdev->state);
3771 return -EBUSY;
3772 }
3773 return 0;
3774}
3775
3776/*
3777 *****************************************************************************
3778 * PCIe MSI/MSI-X INTERRUPT HANDLERS
3779 *****************************************************************************
3780 */
3781
3782static irqreturn_t skd_reserved_isr(int irq, void *skd_host_data)
3783{
3784 struct skd_device *skdev = skd_host_data;
3785 unsigned long flags;
3786
3787 spin_lock_irqsave(&skdev->lock, flags);
3788 pr_debug("%s:%s:%d MSIX = 0x%x\n",
3789 skdev->name, __func__, __LINE__,
3790 SKD_READL(skdev, FIT_INT_STATUS_HOST));
3791 pr_err("(%s): MSIX reserved irq %d = 0x%x\n", skd_name(skdev),
3792 irq, SKD_READL(skdev, FIT_INT_STATUS_HOST));
3793 SKD_WRITEL(skdev, FIT_INT_RESERVED_MASK, FIT_INT_STATUS_HOST);
3794 spin_unlock_irqrestore(&skdev->lock, flags);
3795 return IRQ_HANDLED;
3796}
3797
3798static irqreturn_t skd_statec_isr(int irq, void *skd_host_data)
3799{
3800 struct skd_device *skdev = skd_host_data;
3801 unsigned long flags;
3802
3803 spin_lock_irqsave(&skdev->lock, flags);
3804 pr_debug("%s:%s:%d MSIX = 0x%x\n",
3805 skdev->name, __func__, __LINE__,
3806 SKD_READL(skdev, FIT_INT_STATUS_HOST));
3807 SKD_WRITEL(skdev, FIT_ISH_FW_STATE_CHANGE, FIT_INT_STATUS_HOST);
3808 skd_isr_fwstate(skdev);
3809 spin_unlock_irqrestore(&skdev->lock, flags);
3810 return IRQ_HANDLED;
3811}
3812
3813static irqreturn_t skd_comp_q(int irq, void *skd_host_data)
3814{
3815 struct skd_device *skdev = skd_host_data;
3816 unsigned long flags;
3817 int flush_enqueued = 0;
3818 int deferred;
3819
3820 spin_lock_irqsave(&skdev->lock, flags);
3821 pr_debug("%s:%s:%d MSIX = 0x%x\n",
3822 skdev->name, __func__, __LINE__,
3823 SKD_READL(skdev, FIT_INT_STATUS_HOST));
3824 SKD_WRITEL(skdev, FIT_ISH_COMPLETION_POSTED, FIT_INT_STATUS_HOST);
3825 deferred = skd_isr_completion_posted(skdev, skd_isr_comp_limit,
3826 &flush_enqueued);
3827 if (flush_enqueued)
3828 skd_request_fn(skdev->queue);
3829
3830 if (deferred)
3831 schedule_work(&skdev->completion_worker);
3832 else if (!flush_enqueued)
3833 skd_request_fn(skdev->queue);
3834
3835 spin_unlock_irqrestore(&skdev->lock, flags);
3836
3837 return IRQ_HANDLED;
3838}
3839
3840static irqreturn_t skd_msg_isr(int irq, void *skd_host_data)
3841{
3842 struct skd_device *skdev = skd_host_data;
3843 unsigned long flags;
3844
3845 spin_lock_irqsave(&skdev->lock, flags);
3846 pr_debug("%s:%s:%d MSIX = 0x%x\n",
3847 skdev->name, __func__, __LINE__,
3848 SKD_READL(skdev, FIT_INT_STATUS_HOST));
3849 SKD_WRITEL(skdev, FIT_ISH_MSG_FROM_DEV, FIT_INT_STATUS_HOST);
3850 skd_isr_msg_from_dev(skdev);
3851 spin_unlock_irqrestore(&skdev->lock, flags);
3852 return IRQ_HANDLED;
3853}
3854
3855static irqreturn_t skd_qfull_isr(int irq, void *skd_host_data)
3856{
3857 struct skd_device *skdev = skd_host_data;
3858 unsigned long flags;
3859
3860 spin_lock_irqsave(&skdev->lock, flags);
3861 pr_debug("%s:%s:%d MSIX = 0x%x\n",
3862 skdev->name, __func__, __LINE__,
3863 SKD_READL(skdev, FIT_INT_STATUS_HOST));
3864 SKD_WRITEL(skdev, FIT_INT_QUEUE_FULL, FIT_INT_STATUS_HOST);
3865 spin_unlock_irqrestore(&skdev->lock, flags);
3866 return IRQ_HANDLED;
3867}
3868
3869/*
3870 *****************************************************************************
3871 * PCIe MSI/MSI-X SETUP
3872 *****************************************************************************
3873 */
3874
3875struct skd_msix_entry {
3876 int have_irq;
3877 u32 vector;
3878 u32 entry;
3879 struct skd_device *rsp;
3880 char isr_name[30];
3881};
3882
3883struct skd_init_msix_entry {
3884 const char *name;
3885 irq_handler_t handler;
3886};
3887
3888#define SKD_MAX_MSIX_COUNT 13
3889#define SKD_MIN_MSIX_COUNT 7
3890#define SKD_BASE_MSIX_IRQ 4
3891
3892static struct skd_init_msix_entry msix_entries[SKD_MAX_MSIX_COUNT] = {
3893 { "(DMA 0)", skd_reserved_isr },
3894 { "(DMA 1)", skd_reserved_isr },
3895 { "(DMA 2)", skd_reserved_isr },
3896 { "(DMA 3)", skd_reserved_isr },
3897 { "(State Change)", skd_statec_isr },
3898 { "(COMPL_Q)", skd_comp_q },
3899 { "(MSG)", skd_msg_isr },
3900 { "(Reserved)", skd_reserved_isr },
3901 { "(Reserved)", skd_reserved_isr },
3902 { "(Queue Full 0)", skd_qfull_isr },
3903 { "(Queue Full 1)", skd_qfull_isr },
3904 { "(Queue Full 2)", skd_qfull_isr },
3905 { "(Queue Full 3)", skd_qfull_isr },
3906};
3907
3908static void skd_release_msix(struct skd_device *skdev)
3909{
3910 struct skd_msix_entry *qentry;
3911 int i;
3912
3913 if (skdev->msix_entries == NULL)
3914 return;
3915 for (i = 0; i < skdev->msix_count; i++) {
3916 qentry = &skdev->msix_entries[i];
3917 skdev = qentry->rsp;
3918
3919 if (qentry->have_irq)
3920 devm_free_irq(&skdev->pdev->dev,
3921 qentry->vector, qentry->rsp);
3922 }
3923 pci_disable_msix(skdev->pdev);
3924 kfree(skdev->msix_entries);
3925 skdev->msix_count = 0;
3926 skdev->msix_entries = NULL;
3927}
3928
3929static int skd_acquire_msix(struct skd_device *skdev)
3930{
3931 int i, rc;
3932 struct pci_dev *pdev;
3933 struct msix_entry *entries = NULL;
3934 struct skd_msix_entry *qentry;
3935
3936 pdev = skdev->pdev;
3937 skdev->msix_count = SKD_MAX_MSIX_COUNT;
3938 entries = kzalloc(sizeof(struct msix_entry) * SKD_MAX_MSIX_COUNT,
3939 GFP_KERNEL);
3940 if (!entries)
3941 return -ENOMEM;
3942
3943 for (i = 0; i < SKD_MAX_MSIX_COUNT; i++)
3944 entries[i].entry = i;
3945
3946 rc = pci_enable_msix(pdev, entries, SKD_MAX_MSIX_COUNT);
3947 if (rc < 0)
3948 goto msix_out;
3949 if (rc) {
3950 if (rc < SKD_MIN_MSIX_COUNT) {
3951 pr_err("(%s): failed to enable MSI-X %d\n",
3952 skd_name(skdev), rc);
3953 goto msix_out;
3954 }
3955 pr_debug("%s:%s:%d %s: <%s> allocated %d MSI-X vectors\n",
3956 skdev->name, __func__, __LINE__,
3957 pci_name(pdev), skdev->name, rc);
3958
3959 skdev->msix_count = rc;
3960 rc = pci_enable_msix(pdev, entries, skdev->msix_count);
3961 if (rc) {
3962 pr_err("(%s): failed to enable MSI-X "
3963 "support (%d) %d\n",
3964 skd_name(skdev), skdev->msix_count, rc);
3965 goto msix_out;
3966 }
3967 }
3968 skdev->msix_entries = kzalloc(sizeof(struct skd_msix_entry) *
3969 skdev->msix_count, GFP_KERNEL);
3970 if (!skdev->msix_entries) {
3971 rc = -ENOMEM;
3972 skdev->msix_count = 0;
3973 pr_err("(%s): msix table allocation error\n",
3974 skd_name(skdev));
3975 goto msix_out;
3976 }
3977
3978 qentry = skdev->msix_entries;
3979 for (i = 0; i < skdev->msix_count; i++) {
3980 qentry->vector = entries[i].vector;
3981 qentry->entry = entries[i].entry;
3982 qentry->rsp = NULL;
3983 qentry->have_irq = 0;
3984 pr_debug("%s:%s:%d %s: <%s> msix (%d) vec %d, entry %x\n",
3985 skdev->name, __func__, __LINE__,
3986 pci_name(pdev), skdev->name,
3987 i, qentry->vector, qentry->entry);
3988 qentry++;
3989 }
3990
3991 /* Enable MSI-X vectors for the base queue */
3992 for (i = 0; i < SKD_MAX_MSIX_COUNT; i++) {
3993 qentry = &skdev->msix_entries[i];
3994 snprintf(qentry->isr_name, sizeof(qentry->isr_name),
3995 "%s%d-msix %s", DRV_NAME, skdev->devno,
3996 msix_entries[i].name);
3997 rc = devm_request_irq(&skdev->pdev->dev, qentry->vector,
3998 msix_entries[i].handler, 0,
3999 qentry->isr_name, skdev);
4000 if (rc) {
4001 pr_err("(%s): Unable to register(%d) MSI-X "
4002 "handler %d: %s\n",
4003 skd_name(skdev), rc, i, qentry->isr_name);
4004 goto msix_out;
4005 } else {
4006 qentry->have_irq = 1;
4007 qentry->rsp = skdev;
4008 }
4009 }
4010 pr_debug("%s:%s:%d %s: <%s> msix %d irq(s) enabled\n",
4011 skdev->name, __func__, __LINE__,
4012 pci_name(pdev), skdev->name, skdev->msix_count);
4013 return 0;
4014
4015msix_out:
4016 if (entries)
4017 kfree(entries);
4018 skd_release_msix(skdev);
4019 return rc;
4020}
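
In this kernel era pci_enable_msix() could return a positive count meaning "only this many vectors are available", which is why skd_acquire_msix() retries with the smaller count as long as it still meets SKD_MIN_MSIX_COUNT. A standalone sketch of that negotiate-down pattern; enable_msix() is a hypothetical stub pretending only 9 vectors exist:

#include <stdio.h>

#define MAX_VECS 13	/* mirrors SKD_MAX_MSIX_COUNT */
#define MIN_VECS 7	/* mirrors SKD_MIN_MSIX_COUNT */

/* Hypothetical stand-in for pci_enable_msix(): returns 0 on success,
 * a positive count if fewer vectors are available, or <0 on error. */
static int enable_msix(int requested)
{
	const int available = 9;
	return (requested <= available) ? 0 : available;
}

int main(void)
{
	int count = MAX_VECS;
	int rc = enable_msix(count);

	if (rc > 0) {			/* fewer vectors than asked for */
		if (rc < MIN_VECS) {
			fprintf(stderr, "only %d vectors, need %d\n", rc, MIN_VECS);
			return 1;
		}
		count = rc;		/* retry with what the platform offers */
		rc = enable_msix(count);
	}
	if (rc < 0) {
		fprintf(stderr, "MSI-X enable failed: %d\n", rc);
		return 1;
	}
	printf("MSI-X enabled with %d vectors\n", count);
	return 0;
}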
4021
4022static int skd_acquire_irq(struct skd_device *skdev)
4023{
4024 int rc;
4025 struct pci_dev *pdev;
4026
4027 pdev = skdev->pdev;
4028 skdev->msix_count = 0;
4029
4030RETRY_IRQ_TYPE:
4031 switch (skdev->irq_type) {
4032 case SKD_IRQ_MSIX:
4033 rc = skd_acquire_msix(skdev);
4034 if (!rc)
4035 pr_info("(%s): MSI-X %d irqs enabled\n",
4036 skd_name(skdev), skdev->msix_count);
4037 else {
4038 pr_err(
4039 "(%s): failed to enable MSI-X, re-trying with MSI %d\n",
4040 skd_name(skdev), rc);
4041 skdev->irq_type = SKD_IRQ_MSI;
4042 goto RETRY_IRQ_TYPE;
4043 }
4044 break;
4045 case SKD_IRQ_MSI:
4046 snprintf(skdev->isr_name, sizeof(skdev->isr_name), "%s%d-msi",
4047 DRV_NAME, skdev->devno);
4048 rc = pci_enable_msi(pdev);
4049 if (!rc) {
4050 rc = devm_request_irq(&pdev->dev, pdev->irq, skd_isr, 0,
4051 skdev->isr_name, skdev);
4052 if (rc) {
4053 pci_disable_msi(pdev);
4054 pr_err(
4055 "(%s): failed to allocate the MSI interrupt %d\n",
4056 skd_name(skdev), rc);
4057 goto RETRY_IRQ_LEGACY;
4058 }
4059 pr_info("(%s): MSI irq %d enabled\n",
4060 skd_name(skdev), pdev->irq);
4061 } else {
4062RETRY_IRQ_LEGACY:
4063 pr_err(
4064 "(%s): failed to enable MSI, re-trying with LEGACY %d\n",
4065 skd_name(skdev), rc);
4066 skdev->irq_type = SKD_IRQ_LEGACY;
4067 goto RETRY_IRQ_TYPE;
4068 }
4069 break;
4070 case SKD_IRQ_LEGACY:
4071 snprintf(skdev->isr_name, sizeof(skdev->isr_name),
4072 "%s%d-legacy", DRV_NAME, skdev->devno);
4073 rc = devm_request_irq(&pdev->dev, pdev->irq, skd_isr,
4074 IRQF_SHARED, skdev->isr_name, skdev);
4075 if (!rc)
4076 pr_info("(%s): LEGACY irq %d enabled\n",
4077 skd_name(skdev), pdev->irq);
4078 else
4079 pr_err("(%s): request LEGACY irq error %d\n",
4080 skd_name(skdev), rc);
4081 break;
4082 default:
4083 pr_info("(%s): irq_type %d invalid, re-set to %d\n",
4084 skd_name(skdev), skdev->irq_type, SKD_IRQ_DEFAULT);
4085 skdev->irq_type = SKD_IRQ_LEGACY;
4086 goto RETRY_IRQ_TYPE;
4087 }
4088 return rc;
4089}
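
The RETRY_IRQ_TYPE gotos above implement a fixed fallback order: MSI-X, then MSI, then legacy INTx. The same order can also be written as a table-driven loop; below is a standalone sketch in which try_msix(), try_msi() and try_legacy() are hypothetical stubs standing in for the real enable/request calls:

#include <stdio.h>

/* Hypothetical setup functions standing in for skd_acquire_msix(),
 * pci_enable_msi()+devm_request_irq(), and the IRQF_SHARED legacy request. */
static int try_msix(void)   { return -1; }	/* pretend MSI-X is unavailable */
static int try_msi(void)    { return -1; }	/* pretend MSI is unavailable too */
static int try_legacy(void) { return 0;  }	/* legacy INTx succeeds */

int main(void)
{
	struct { const char *name; int (*setup)(void); } chain[] = {
		{ "MSI-X",  try_msix   },
		{ "MSI",    try_msi    },
		{ "LEGACY", try_legacy },
	};
	size_t i;

	/* Same fallback order the RETRY_IRQ_TYPE gotos walk through. */
	for (i = 0; i < sizeof(chain) / sizeof(chain[0]); i++) {
		if (chain[i].setup() == 0) {
			printf("%s irq enabled\n", chain[i].name);
			return 0;
		}
		fprintf(stderr, "%s failed, falling back\n", chain[i].name);
	}
	fprintf(stderr, "no interrupt mode available\n");
	return 1;
}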
4090
4091static void skd_release_irq(struct skd_device *skdev)
4092{
4093 switch (skdev->irq_type) {
4094 case SKD_IRQ_MSIX:
4095 skd_release_msix(skdev);
4096 break;
4097 case SKD_IRQ_MSI:
4098 devm_free_irq(&skdev->pdev->dev, skdev->pdev->irq, skdev);
4099 pci_disable_msi(skdev->pdev);
4100 break;
4101 case SKD_IRQ_LEGACY:
4102 devm_free_irq(&skdev->pdev->dev, skdev->pdev->irq, skdev);
4103 break;
4104 default:
4105 pr_err("(%s): wrong irq type %d!",
4106 skd_name(skdev), skdev->irq_type);
4107 break;
4108 }
4109}
4110
4111/*
4112 *****************************************************************************
4113 * CONSTRUCT
4114 *****************************************************************************
4115 */
4116
4117static int skd_cons_skcomp(struct skd_device *skdev)
4118{
4119 int rc = 0;
4120 struct fit_completion_entry_v1 *skcomp;
4121 u32 nbytes;
4122
4123 nbytes = sizeof(*skcomp) * SKD_N_COMPLETION_ENTRY;
4124 nbytes += sizeof(struct fit_comp_error_info) * SKD_N_COMPLETION_ENTRY;
4125
4126 pr_debug("%s:%s:%d comp pci_alloc, total bytes %d entries %d\n",
4127 skdev->name, __func__, __LINE__,
4128 nbytes, SKD_N_COMPLETION_ENTRY);
4129
4130 skcomp = pci_alloc_consistent(skdev->pdev, nbytes,
4131 &skdev->cq_dma_address);
4132
4133 if (skcomp == NULL) {
4134 rc = -ENOMEM;
4135 goto err_out;
4136 }
4137
4138 memset(skcomp, 0, nbytes);
4139
4140 skdev->skcomp_table = skcomp;
4141 skdev->skerr_table = (struct fit_comp_error_info *)((char *)skcomp +
4142 sizeof(*skcomp) *
4143 SKD_N_COMPLETION_ENTRY);
4144
4145err_out:
4146 return rc;
4147}
4148
4149static int skd_cons_skmsg(struct skd_device *skdev)
4150{
4151 int rc = 0;
4152 u32 i;
4153
4154 pr_debug("%s:%s:%d skmsg_table kzalloc, struct %lu, count %u total %lu\n",
4155 skdev->name, __func__, __LINE__,
4156 sizeof(struct skd_fitmsg_context),
4157 skdev->num_fitmsg_context,
4158 sizeof(struct skd_fitmsg_context) * skdev->num_fitmsg_context);
4159
4160 skdev->skmsg_table = kzalloc(sizeof(struct skd_fitmsg_context)
4161 *skdev->num_fitmsg_context, GFP_KERNEL);
4162 if (skdev->skmsg_table == NULL) {
4163 rc = -ENOMEM;
4164 goto err_out;
4165 }
4166
4167 for (i = 0; i < skdev->num_fitmsg_context; i++) {
4168 struct skd_fitmsg_context *skmsg;
4169
4170 skmsg = &skdev->skmsg_table[i];
4171
4172 skmsg->id = i + SKD_ID_FIT_MSG;
4173
4174 skmsg->state = SKD_MSG_STATE_IDLE;
4175 skmsg->msg_buf = pci_alloc_consistent(skdev->pdev,
4176 SKD_N_FITMSG_BYTES + 64,
4177 &skmsg->mb_dma_address);
4178
4179 if (skmsg->msg_buf == NULL) {
4180 rc = -ENOMEM;
4181 goto err_out;
4182 }
4183
4184 skmsg->offset = (u32)((u64)skmsg->msg_buf &
4185 (~FIT_QCMD_BASE_ADDRESS_MASK));
4186 skmsg->msg_buf += ~FIT_QCMD_BASE_ADDRESS_MASK;
4187 skmsg->msg_buf = (u8 *)((u64)skmsg->msg_buf &
4188 FIT_QCMD_BASE_ADDRESS_MASK);
4189 skmsg->mb_dma_address += ~FIT_QCMD_BASE_ADDRESS_MASK;
4190 skmsg->mb_dma_address &= FIT_QCMD_BASE_ADDRESS_MASK;
4191 memset(skmsg->msg_buf, 0, SKD_N_FITMSG_BYTES);
4192
4193 skmsg->next = &skmsg[1];
4194 }
4195
4196 /* Free list is in order starting with the 0th entry. */
4197 skdev->skmsg_table[i - 1].next = NULL;
4198 skdev->skmsg_free_list = skdev->skmsg_table;
4199
4200err_out:
4201 return rc;
4202}
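
The pointer arithmetic in skd_cons_skmsg() carves each message buffer into a 64-byte-aligned base (FIT_QCMD_BASE_ADDRESS_MASK keeps bits 63:6) plus a small offset that is remembered so skd_free_skmsg() can undo the adjustment before freeing. A minimal standalone illustration of that bit math; the sample DMA address is made up:

#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

#define BASE_ADDRESS_MASK 0xFFFFFFFFFFFFFFC0ull	/* mirrors FIT_QCMD_BASE_ADDRESS_MASK */

int main(void)
{
	/* Example (made-up) DMA address; real coherent allocations are usually aligned already. */
	uint64_t dma = 0x123456789ab1ull;

	uint64_t offset  = dma & ~BASE_ADDRESS_MASK;				/* low 6 bits */
	uint64_t aligned = (dma + ~BASE_ADDRESS_MASK) & BASE_ADDRESS_MASK;	/* round up to 64 B */

	printf("dma     = 0x%" PRIx64 "\n", dma);
	printf("offset  = 0x%" PRIx64 "\n", offset);
	printf("aligned = 0x%" PRIx64 " (multiple of 64: %s)\n",
	       aligned, (aligned % 64 == 0) ? "yes" : "no");
	return 0;
}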
4203
4204static struct fit_sg_descriptor *skd_cons_sg_list(struct skd_device *skdev,
4205 u32 n_sg,
4206 dma_addr_t *ret_dma_addr)
4207{
4208 struct fit_sg_descriptor *sg_list;
4209 u32 nbytes;
4210
4211 nbytes = sizeof(*sg_list) * n_sg;
4212
4213 sg_list = pci_alloc_consistent(skdev->pdev, nbytes, ret_dma_addr);
4214
4215 if (sg_list != NULL) {
4216 uint64_t dma_address = *ret_dma_addr;
4217 u32 i;
4218
4219 memset(sg_list, 0, nbytes);
4220
4221 for (i = 0; i < n_sg - 1; i++) {
4222 uint64_t ndp_off;
4223 ndp_off = (i + 1) * sizeof(struct fit_sg_descriptor);
4224
4225 sg_list[i].next_desc_ptr = dma_address + ndp_off;
4226 }
4227 sg_list[i].next_desc_ptr = 0LL;
4228 }
4229
4230 return sg_list;
4231}
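
skd_cons_sg_list() links the descriptors through their next_desc_ptr fields using bus addresses rather than CPU pointers, terminating the last descriptor with zero. A userspace sketch of the same chaining; the dma_base value is invented and an ordinary array stands in for the coherent DMA buffer:

#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

/* Same layout as struct fit_sg_descriptor in skd_s1120.h (32 bytes). */
struct sg_desc {
	uint32_t control;
	uint32_t byte_count;
	uint64_t host_side_addr;
	uint64_t dev_side_addr;
	uint64_t next_desc_ptr;
};

int main(void)
{
	enum { N_SG = 4 };
	static struct sg_desc list[N_SG];
	/* Pretend this is the bus address the controller sees for list[0]. */
	uint64_t dma_base = 0x80000000ull;
	uint32_t i;

	for (i = 0; i < N_SG - 1; i++)
		list[i].next_desc_ptr = dma_base + (uint64_t)(i + 1) * sizeof(struct sg_desc);
	list[i].next_desc_ptr = 0;	/* terminate the chain, as skd_cons_sg_list() does */

	for (i = 0; i < N_SG; i++)
		printf("desc %u -> next at 0x%" PRIx64 "\n", i, list[i].next_desc_ptr);
	return 0;
}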
4232
4233static int skd_cons_skreq(struct skd_device *skdev)
4234{
4235 int rc = 0;
4236 u32 i;
4237
4238 pr_debug("%s:%s:%d skreq_table kzalloc, struct %lu, count %u total %lu\n",
4239 skdev->name, __func__, __LINE__,
4240 sizeof(struct skd_request_context),
4241 skdev->num_req_context,
4242 sizeof(struct skd_request_context) * skdev->num_req_context);
4243
4244 skdev->skreq_table = kzalloc(sizeof(struct skd_request_context)
4245 * skdev->num_req_context, GFP_KERNEL);
4246 if (skdev->skreq_table == NULL) {
4247 rc = -ENOMEM;
4248 goto err_out;
4249 }
4250
4251 pr_debug("%s:%s:%d alloc sg_table sg_per_req %u scatlist %lu total %lu\n",
4252 skdev->name, __func__, __LINE__,
4253 skdev->sgs_per_request, sizeof(struct scatterlist),
4254 skdev->sgs_per_request * sizeof(struct scatterlist));
4255
4256 for (i = 0; i < skdev->num_req_context; i++) {
4257 struct skd_request_context *skreq;
4258
4259 skreq = &skdev->skreq_table[i];
4260
4261 skreq->id = i + SKD_ID_RW_REQUEST;
4262 skreq->state = SKD_REQ_STATE_IDLE;
4263
4264 skreq->sg = kzalloc(sizeof(struct scatterlist) *
4265 skdev->sgs_per_request, GFP_KERNEL);
4266 if (skreq->sg == NULL) {
4267 rc = -ENOMEM;
4268 goto err_out;
4269 }
4270 sg_init_table(skreq->sg, skdev->sgs_per_request);
4271
4272 skreq->sksg_list = skd_cons_sg_list(skdev,
4273 skdev->sgs_per_request,
4274 &skreq->sksg_dma_address);
4275
4276 if (skreq->sksg_list == NULL) {
4277 rc = -ENOMEM;
4278 goto err_out;
4279 }
4280
4281 skreq->next = &skreq[1];
4282 }
4283
4284 /* Free list is in order starting with the 0th entry. */
4285 skdev->skreq_table[i - 1].next = NULL;
4286 skdev->skreq_free_list = skdev->skreq_table;
4287
4288err_out:
4289 return rc;
4290}
4291
4292static int skd_cons_skspcl(struct skd_device *skdev)
4293{
4294 int rc = 0;
4295 u32 i, nbytes;
4296
4297 pr_debug("%s:%s:%d skspcl_table kzalloc, struct %lu, count %u total %lu\n",
4298 skdev->name, __func__, __LINE__,
4299 sizeof(struct skd_special_context),
4300 skdev->n_special,
4301 sizeof(struct skd_special_context) * skdev->n_special);
4302
4303 skdev->skspcl_table = kzalloc(sizeof(struct skd_special_context)
4304 * skdev->n_special, GFP_KERNEL);
4305 if (skdev->skspcl_table == NULL) {
4306 rc = -ENOMEM;
4307 goto err_out;
4308 }
4309
4310 for (i = 0; i < skdev->n_special; i++) {
4311 struct skd_special_context *skspcl;
4312
4313 skspcl = &skdev->skspcl_table[i];
4314
4315 skspcl->req.id = i + SKD_ID_SPECIAL_REQUEST;
4316 skspcl->req.state = SKD_REQ_STATE_IDLE;
4317
4318 skspcl->req.next = &skspcl[1].req;
4319
4320 nbytes = SKD_N_SPECIAL_FITMSG_BYTES;
4321
4322 skspcl->msg_buf = pci_alloc_consistent(skdev->pdev, nbytes,
4323 &skspcl->mb_dma_address);
4324 if (skspcl->msg_buf == NULL) {
4325 rc = -ENOMEM;
4326 goto err_out;
4327 }
4328
4329 memset(skspcl->msg_buf, 0, nbytes);
4330
4331 skspcl->req.sg = kzalloc(sizeof(struct scatterlist) *
4332 SKD_N_SG_PER_SPECIAL, GFP_KERNEL);
4333 if (skspcl->req.sg == NULL) {
4334 rc = -ENOMEM;
4335 goto err_out;
4336 }
4337
4338 skspcl->req.sksg_list = skd_cons_sg_list(skdev,
4339 SKD_N_SG_PER_SPECIAL,
4340 &skspcl->req.
4341 sksg_dma_address);
4342 if (skspcl->req.sksg_list == NULL) {
4343 rc = -ENOMEM;
4344 goto err_out;
4345 }
4346 }
4347
4348 /* Free list is in order starting with the 0th entry. */
4349 skdev->skspcl_table[i - 1].req.next = NULL;
4350 skdev->skspcl_free_list = skdev->skspcl_table;
4351
4352 return rc;
4353
4354err_out:
4355 return rc;
4356}
4357
4358static int skd_cons_sksb(struct skd_device *skdev)
4359{
4360 int rc = 0;
4361 struct skd_special_context *skspcl;
4362 u32 nbytes;
4363
4364 skspcl = &skdev->internal_skspcl;
4365
4366 skspcl->req.id = 0 + SKD_ID_INTERNAL;
4367 skspcl->req.state = SKD_REQ_STATE_IDLE;
4368
4369 nbytes = SKD_N_INTERNAL_BYTES;
4370
4371 skspcl->data_buf = pci_alloc_consistent(skdev->pdev, nbytes,
4372 &skspcl->db_dma_address);
4373 if (skspcl->data_buf == NULL) {
4374 rc = -ENOMEM;
4375 goto err_out;
4376 }
4377
4378 memset(skspcl->data_buf, 0, nbytes);
4379
4380 nbytes = SKD_N_SPECIAL_FITMSG_BYTES;
4381 skspcl->msg_buf = pci_alloc_consistent(skdev->pdev, nbytes,
4382 &skspcl->mb_dma_address);
4383 if (skspcl->msg_buf == NULL) {
4384 rc = -ENOMEM;
4385 goto err_out;
4386 }
4387
4388 memset(skspcl->msg_buf, 0, nbytes);
4389
4390 skspcl->req.sksg_list = skd_cons_sg_list(skdev, 1,
4391 &skspcl->req.sksg_dma_address);
4392 if (skspcl->req.sksg_list == NULL) {
4393 rc = -ENOMEM;
4394 goto err_out;
4395 }
4396
4397 if (!skd_format_internal_skspcl(skdev)) {
4398 rc = -EINVAL;
4399 goto err_out;
4400 }
4401
4402err_out:
4403 return rc;
4404}
4405
4406static int skd_cons_disk(struct skd_device *skdev)
4407{
4408 int rc = 0;
4409 struct gendisk *disk;
4410 struct request_queue *q;
4411 unsigned long flags;
4412
4413 disk = alloc_disk(SKD_MINORS_PER_DEVICE);
4414 if (!disk) {
4415 rc = -ENOMEM;
4416 goto err_out;
4417 }
4418
4419 skdev->disk = disk;
4420 sprintf(disk->disk_name, DRV_NAME "%u", skdev->devno);
4421
4422 disk->major = skdev->major;
4423 disk->first_minor = skdev->devno * SKD_MINORS_PER_DEVICE;
4424 disk->fops = &skd_blockdev_ops;
4425 disk->private_data = skdev;
4426
4427 q = blk_init_queue(skd_request_fn, &skdev->lock);
4428 if (!q) {
4429 rc = -ENOMEM;
4430 goto err_out;
4431 }
4432
4433 skdev->queue = q;
4434 disk->queue = q;
4435 q->queuedata = skdev;
4436
4437 blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
4438 blk_queue_max_segments(q, skdev->sgs_per_request);
4439 blk_queue_max_hw_sectors(q, SKD_N_MAX_SECTORS);
4440
4441	/* set sysfs optimal_io_size to 8K */
4442 blk_queue_io_opt(q, 8192);
4443
4444 /* DISCARD Flag initialization. */
4445 q->limits.discard_granularity = 8192;
4446 q->limits.discard_alignment = 0;
4447 q->limits.max_discard_sectors = UINT_MAX >> 9;
4448 q->limits.discard_zeroes_data = 1;
4449 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
4450 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
4451
4452 spin_lock_irqsave(&skdev->lock, flags);
4453 pr_debug("%s:%s:%d stopping %s queue\n",
4454 skdev->name, __func__, __LINE__, skdev->name);
4455 blk_stop_queue(skdev->queue);
4456 spin_unlock_irqrestore(&skdev->lock, flags);
4457
4458err_out:
4459 return rc;
4460}
4461
4462#define SKD_N_DEV_TABLE 16u
4463static u32 skd_next_devno;
4464
4465static struct skd_device *skd_construct(struct pci_dev *pdev)
4466{
4467 struct skd_device *skdev;
4468 int blk_major = skd_major;
4469 int rc;
4470
4471 skdev = kzalloc(sizeof(*skdev), GFP_KERNEL);
4472
4473 if (!skdev) {
4474 pr_err(PFX "(%s): memory alloc failure\n",
4475 pci_name(pdev));
4476 return NULL;
4477 }
4478
4479 skdev->state = SKD_DRVR_STATE_LOAD;
4480 skdev->pdev = pdev;
4481 skdev->devno = skd_next_devno++;
4482 skdev->major = blk_major;
4483 skdev->irq_type = skd_isr_type;
4484 sprintf(skdev->name, DRV_NAME "%d", skdev->devno);
4485 skdev->dev_max_queue_depth = 0;
4486
4487 skdev->num_req_context = skd_max_queue_depth;
4488 skdev->num_fitmsg_context = skd_max_queue_depth;
4489 skdev->n_special = skd_max_pass_thru;
4490 skdev->cur_max_queue_depth = 1;
4491 skdev->queue_low_water_mark = 1;
4492 skdev->proto_ver = 99;
4493 skdev->sgs_per_request = skd_sgs_per_request;
4494 skdev->dbg_level = skd_dbg_level;
4495
4496 atomic_set(&skdev->device_count, 0);
4497
4498 spin_lock_init(&skdev->lock);
4499
4500 INIT_WORK(&skdev->completion_worker, skd_completion_worker);
4501
4502 pr_debug("%s:%s:%d skcomp\n", skdev->name, __func__, __LINE__);
4503 rc = skd_cons_skcomp(skdev);
4504 if (rc < 0)
4505 goto err_out;
4506
4507 pr_debug("%s:%s:%d skmsg\n", skdev->name, __func__, __LINE__);
4508 rc = skd_cons_skmsg(skdev);
4509 if (rc < 0)
4510 goto err_out;
4511
4512 pr_debug("%s:%s:%d skreq\n", skdev->name, __func__, __LINE__);
4513 rc = skd_cons_skreq(skdev);
4514 if (rc < 0)
4515 goto err_out;
4516
4517 pr_debug("%s:%s:%d skspcl\n", skdev->name, __func__, __LINE__);
4518 rc = skd_cons_skspcl(skdev);
4519 if (rc < 0)
4520 goto err_out;
4521
4522 pr_debug("%s:%s:%d sksb\n", skdev->name, __func__, __LINE__);
4523 rc = skd_cons_sksb(skdev);
4524 if (rc < 0)
4525 goto err_out;
4526
4527 pr_debug("%s:%s:%d disk\n", skdev->name, __func__, __LINE__);
4528 rc = skd_cons_disk(skdev);
4529 if (rc < 0)
4530 goto err_out;
4531
4532 pr_debug("%s:%s:%d VICTORY\n", skdev->name, __func__, __LINE__);
4533 return skdev;
4534
4535err_out:
4536 pr_debug("%s:%s:%d construct failed\n",
4537 skdev->name, __func__, __LINE__);
4538 skd_destruct(skdev);
4539 return NULL;
4540}
4541
4542/*
4543 *****************************************************************************
4544 * DESTRUCT (FREE)
4545 *****************************************************************************
4546 */
4547
4548static void skd_free_skcomp(struct skd_device *skdev)
4549{
4550 if (skdev->skcomp_table != NULL) {
4551 u32 nbytes;
4552
4553 nbytes = sizeof(skdev->skcomp_table[0]) *
4554 SKD_N_COMPLETION_ENTRY;
4555 pci_free_consistent(skdev->pdev, nbytes,
4556 skdev->skcomp_table, skdev->cq_dma_address);
4557 }
4558
4559 skdev->skcomp_table = NULL;
4560 skdev->cq_dma_address = 0;
4561}
4562
4563static void skd_free_skmsg(struct skd_device *skdev)
4564{
4565 u32 i;
4566
4567 if (skdev->skmsg_table == NULL)
4568 return;
4569
4570 for (i = 0; i < skdev->num_fitmsg_context; i++) {
4571 struct skd_fitmsg_context *skmsg;
4572
4573 skmsg = &skdev->skmsg_table[i];
4574
4575 if (skmsg->msg_buf != NULL) {
4576 skmsg->msg_buf += skmsg->offset;
4577 skmsg->mb_dma_address += skmsg->offset;
4578 pci_free_consistent(skdev->pdev, SKD_N_FITMSG_BYTES,
4579 skmsg->msg_buf,
4580 skmsg->mb_dma_address);
4581 }
4582 skmsg->msg_buf = NULL;
4583 skmsg->mb_dma_address = 0;
4584 }
4585
4586 kfree(skdev->skmsg_table);
4587 skdev->skmsg_table = NULL;
4588}
4589
4590static void skd_free_sg_list(struct skd_device *skdev,
4591 struct fit_sg_descriptor *sg_list,
4592 u32 n_sg, dma_addr_t dma_addr)
4593{
4594 if (sg_list != NULL) {
4595 u32 nbytes;
4596
4597 nbytes = sizeof(*sg_list) * n_sg;
4598
4599 pci_free_consistent(skdev->pdev, nbytes, sg_list, dma_addr);
4600 }
4601}
4602
4603static void skd_free_skreq(struct skd_device *skdev)
4604{
4605 u32 i;
4606
4607 if (skdev->skreq_table == NULL)
4608 return;
4609
4610 for (i = 0; i < skdev->num_req_context; i++) {
4611 struct skd_request_context *skreq;
4612
4613 skreq = &skdev->skreq_table[i];
4614
4615 skd_free_sg_list(skdev, skreq->sksg_list,
4616 skdev->sgs_per_request,
4617 skreq->sksg_dma_address);
4618
4619 skreq->sksg_list = NULL;
4620 skreq->sksg_dma_address = 0;
4621
4622 kfree(skreq->sg);
4623 }
4624
4625 kfree(skdev->skreq_table);
4626 skdev->skreq_table = NULL;
4627}
4628
4629static void skd_free_skspcl(struct skd_device *skdev)
4630{
4631 u32 i;
4632 u32 nbytes;
4633
4634 if (skdev->skspcl_table == NULL)
4635 return;
4636
4637 for (i = 0; i < skdev->n_special; i++) {
4638 struct skd_special_context *skspcl;
4639
4640 skspcl = &skdev->skspcl_table[i];
4641
4642 if (skspcl->msg_buf != NULL) {
4643 nbytes = SKD_N_SPECIAL_FITMSG_BYTES;
4644 pci_free_consistent(skdev->pdev, nbytes,
4645 skspcl->msg_buf,
4646 skspcl->mb_dma_address);
4647 }
4648
4649 skspcl->msg_buf = NULL;
4650 skspcl->mb_dma_address = 0;
4651
4652 skd_free_sg_list(skdev, skspcl->req.sksg_list,
4653 SKD_N_SG_PER_SPECIAL,
4654 skspcl->req.sksg_dma_address);
4655
4656 skspcl->req.sksg_list = NULL;
4657 skspcl->req.sksg_dma_address = 0;
4658
4659 kfree(skspcl->req.sg);
4660 }
4661
4662 kfree(skdev->skspcl_table);
4663 skdev->skspcl_table = NULL;
4664}
4665
4666static void skd_free_sksb(struct skd_device *skdev)
4667{
4668 struct skd_special_context *skspcl;
4669 u32 nbytes;
4670
4671 skspcl = &skdev->internal_skspcl;
4672
4673 if (skspcl->data_buf != NULL) {
4674 nbytes = SKD_N_INTERNAL_BYTES;
4675
4676 pci_free_consistent(skdev->pdev, nbytes,
4677 skspcl->data_buf, skspcl->db_dma_address);
4678 }
4679
4680 skspcl->data_buf = NULL;
4681 skspcl->db_dma_address = 0;
4682
4683 if (skspcl->msg_buf != NULL) {
4684 nbytes = SKD_N_SPECIAL_FITMSG_BYTES;
4685 pci_free_consistent(skdev->pdev, nbytes,
4686 skspcl->msg_buf, skspcl->mb_dma_address);
4687 }
4688
4689 skspcl->msg_buf = NULL;
4690 skspcl->mb_dma_address = 0;
4691
4692 skd_free_sg_list(skdev, skspcl->req.sksg_list, 1,
4693 skspcl->req.sksg_dma_address);
4694
4695 skspcl->req.sksg_list = NULL;
4696 skspcl->req.sksg_dma_address = 0;
4697}
4698
4699static void skd_free_disk(struct skd_device *skdev)
4700{
4701 struct gendisk *disk = skdev->disk;
4702
4703 if (disk != NULL) {
4704 struct request_queue *q = disk->queue;
4705
4706 if (disk->flags & GENHD_FL_UP)
4707 del_gendisk(disk);
4708 if (q)
4709 blk_cleanup_queue(q);
4710 put_disk(disk);
4711 }
4712 skdev->disk = NULL;
4713}
4714
4715static void skd_destruct(struct skd_device *skdev)
4716{
4717 if (skdev == NULL)
4718 return;
4719
4720
4721 pr_debug("%s:%s:%d disk\n", skdev->name, __func__, __LINE__);
4722 skd_free_disk(skdev);
4723
4724 pr_debug("%s:%s:%d sksb\n", skdev->name, __func__, __LINE__);
4725 skd_free_sksb(skdev);
4726
4727 pr_debug("%s:%s:%d skspcl\n", skdev->name, __func__, __LINE__);
4728 skd_free_skspcl(skdev);
4729
4730 pr_debug("%s:%s:%d skreq\n", skdev->name, __func__, __LINE__);
4731 skd_free_skreq(skdev);
4732
4733 pr_debug("%s:%s:%d skmsg\n", skdev->name, __func__, __LINE__);
4734 skd_free_skmsg(skdev);
4735
4736 pr_debug("%s:%s:%d skcomp\n", skdev->name, __func__, __LINE__);
4737 skd_free_skcomp(skdev);
4738
4739 pr_debug("%s:%s:%d skdev\n", skdev->name, __func__, __LINE__);
4740 kfree(skdev);
4741}
4742
4743/*
4744 *****************************************************************************
4745 * BLOCK DEVICE (BDEV) GLUE
4746 *****************************************************************************
4747 */
4748
4749static int skd_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo)
4750{
4751 struct skd_device *skdev;
4752 u64 capacity;
4753
4754 skdev = bdev->bd_disk->private_data;
4755
4756 pr_debug("%s:%s:%d %s: CMD[%s] getgeo device\n",
4757 skdev->name, __func__, __LINE__,
4758 bdev->bd_disk->disk_name, current->comm);
4759
4760 if (skdev->read_cap_is_valid) {
4761 capacity = get_capacity(skdev->disk);
4762 geo->heads = 64;
4763 geo->sectors = 255;
4764 geo->cylinders = (capacity) / (255 * 64);
4765
4766 return 0;
4767 }
4768 return -EIO;
4769}
4770
4771static int skd_bdev_attach(struct skd_device *skdev)
4772{
4773 pr_debug("%s:%s:%d add_disk\n", skdev->name, __func__, __LINE__);
4774 add_disk(skdev->disk);
4775 return 0;
4776}
4777
4778static const struct block_device_operations skd_blockdev_ops = {
4779 .owner = THIS_MODULE,
4780 .ioctl = skd_bdev_ioctl,
4781 .getgeo = skd_bdev_getgeo,
4782};
4783
4784
4785/*
4786 *****************************************************************************
4787 * PCIe DRIVER GLUE
4788 *****************************************************************************
4789 */
4790
4791static DEFINE_PCI_DEVICE_TABLE(skd_pci_tbl) = {
4792 { PCI_VENDOR_ID_STEC, PCI_DEVICE_ID_S1120,
4793 PCI_ANY_ID, PCI_ANY_ID, 0, 0, },
4794 { 0 } /* terminate list */
4795};
4796
4797MODULE_DEVICE_TABLE(pci, skd_pci_tbl);
4798
4799static char *skd_pci_info(struct skd_device *skdev, char *str)
4800{
4801 int pcie_reg;
4802
4803 strcpy(str, "PCIe (");
4804 pcie_reg = pci_find_capability(skdev->pdev, PCI_CAP_ID_EXP);
4805
4806 if (pcie_reg) {
4807
4808 char lwstr[6];
4809 uint16_t pcie_lstat, lspeed, lwidth;
4810
4811 pcie_reg += 0x12;
4812 pci_read_config_word(skdev->pdev, pcie_reg, &pcie_lstat);
4813 lspeed = pcie_lstat & (0xF);
4814 lwidth = (pcie_lstat & 0x3F0) >> 4;
4815
4816 if (lspeed == 1)
4817 strcat(str, "2.5GT/s ");
4818 else if (lspeed == 2)
4819 strcat(str, "5.0GT/s ");
4820 else
4821 strcat(str, "<unknown> ");
4822 snprintf(lwstr, sizeof(lwstr), "%dX)", lwidth);
4823 strcat(str, lwstr);
4824 }
4825 return str;
4826}
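
skd_pci_info() decodes the PCIe Link Status register by hand: the link speed sits in bits 3:0 and the negotiated width in bits 9:4. A standalone decode of one sample register value (0x1042 is made up for illustration):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t lstat = 0x1042;	/* sample Link Status value: 5.0 GT/s, x4 */
	unsigned lspeed = lstat & 0xF;
	unsigned lwidth = (lstat & 0x3F0) >> 4;
	const char *speed = (lspeed == 1) ? "2.5GT/s" :
			    (lspeed == 2) ? "5.0GT/s" : "<unknown>";

	printf("PCIe (%s %uX)\n", speed, lwidth);	/* -> "PCIe (5.0GT/s 4X)" */
	return 0;
}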
4827
4828static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
4829{
4830 int i;
4831 int rc = 0;
4832 char pci_str[32];
4833 struct skd_device *skdev;
4834
4835 pr_info("STEC s1120 Driver(%s) version %s-b%s\n",
4836 DRV_NAME, DRV_VERSION, DRV_BUILD_ID);
4837 pr_info("(skd?:??:[%s]): vendor=%04X device=%04x\n",
4838 pci_name(pdev), pdev->vendor, pdev->device);
4839
4840 rc = pci_enable_device(pdev);
4841 if (rc)
4842 return rc;
4843 rc = pci_request_regions(pdev, DRV_NAME);
4844 if (rc)
4845 goto err_out;
4846 rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
4847 if (!rc) {
4848 if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) {
4849
4850 pr_err("(%s): consistent DMA mask error %d\n",
4851 pci_name(pdev), rc);
4852 }
4853 } else {
4854		rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
4855 if (rc) {
4856
4857 pr_err("(%s): DMA mask error %d\n",
4858 pci_name(pdev), rc);
4859 goto err_out_regions;
4860 }
4861 }
4862
4863 if (!skd_major) {
4864 rc = register_blkdev(0, DRV_NAME);
4865 if (rc < 0)
4866 goto err_out_regions;
4867 BUG_ON(!rc);
4868 skd_major = rc;
4869 }
4870
4871 skdev = skd_construct(pdev);
4872 if (skdev == NULL) {
4873 rc = -ENOMEM;
4874 goto err_out_regions;
4875 }
4876
4877 skd_pci_info(skdev, pci_str);
4878 pr_info("(%s): %s 64bit\n", skd_name(skdev), pci_str);
4879
4880 pci_set_master(pdev);
4881 rc = pci_enable_pcie_error_reporting(pdev);
4882 if (rc) {
4883 pr_err(
4884 "(%s): bad enable of PCIe error reporting rc=%d\n",
4885 skd_name(skdev), rc);
4886 skdev->pcie_error_reporting_is_enabled = 0;
4887 } else
4888 skdev->pcie_error_reporting_is_enabled = 1;
4889
4890
4891 pci_set_drvdata(pdev, skdev);
4892
4893 skdev->disk->driverfs_dev = &pdev->dev;
4894
4895 for (i = 0; i < SKD_MAX_BARS; i++) {
4896 skdev->mem_phys[i] = pci_resource_start(pdev, i);
4897 skdev->mem_size[i] = (u32)pci_resource_len(pdev, i);
4898 skdev->mem_map[i] = ioremap(skdev->mem_phys[i],
4899 skdev->mem_size[i]);
4900 if (!skdev->mem_map[i]) {
4901 pr_err("(%s): Unable to map adapter memory!\n",
4902 skd_name(skdev));
4903 rc = -ENODEV;
4904 goto err_out_iounmap;
4905 }
4906 pr_debug("%s:%s:%d mem_map=%p, phyd=%016llx, size=%d\n",
4907 skdev->name, __func__, __LINE__,
4908 skdev->mem_map[i],
4909 (uint64_t)skdev->mem_phys[i], skdev->mem_size[i]);
4910 }
4911
4912 rc = skd_acquire_irq(skdev);
4913 if (rc) {
4914 pr_err("(%s): interrupt resource error %d\n",
4915 skd_name(skdev), rc);
4916 goto err_out_iounmap;
4917 }
4918
4919 rc = skd_start_timer(skdev);
4920 if (rc)
4921 goto err_out_timer;
4922
4923 init_waitqueue_head(&skdev->waitq);
4924
4925 skd_start_device(skdev);
4926
4927 rc = wait_event_interruptible_timeout(skdev->waitq,
4928 (skdev->gendisk_on),
4929 (SKD_START_WAIT_SECONDS * HZ));
4930 if (skdev->gendisk_on > 0) {
4931 /* device came on-line after reset */
4932 skd_bdev_attach(skdev);
4933 rc = 0;
4934 } else {
4935 /* we timed out, something is wrong with the device,
4936 don't add the disk structure */
4937 pr_err(
4938 "(%s): error: waiting for s1120 timed out %d!\n",
4939 skd_name(skdev), rc);
4940		/* if there was no error, time out with -ENXIO */
4941 if (!rc)
4942 rc = -ENXIO;
4943 goto err_out_timer;
4944 }
4945
4946
4947#ifdef SKD_VMK_POLL_HANDLER
4948 if (skdev->irq_type == SKD_IRQ_MSIX) {
4949 /* MSIX completion handler is being used for coredump */
4950 vmklnx_scsi_register_poll_handler(skdev->scsi_host,
4951 skdev->msix_entries[5].vector,
4952 skd_comp_q, skdev);
4953 } else {
4954 vmklnx_scsi_register_poll_handler(skdev->scsi_host,
4955 skdev->pdev->irq, skd_isr,
4956 skdev);
4957 }
4958#endif /* SKD_VMK_POLL_HANDLER */
4959
4960 return rc;
4961
4962err_out_timer:
4963 skd_stop_device(skdev);
4964 skd_release_irq(skdev);
4965
4966err_out_iounmap:
4967 for (i = 0; i < SKD_MAX_BARS; i++)
4968 if (skdev->mem_map[i])
4969 iounmap(skdev->mem_map[i]);
4970
4971 if (skdev->pcie_error_reporting_is_enabled)
4972 pci_disable_pcie_error_reporting(pdev);
4973
4974 skd_destruct(skdev);
4975
4976err_out_regions:
4977 pci_release_regions(pdev);
4978
4979err_out:
4980 pci_disable_device(pdev);
4981 pci_set_drvdata(pdev, NULL);
4982 return rc;
4983}
4984
4985static void skd_pci_remove(struct pci_dev *pdev)
4986{
4987 int i;
4988 struct skd_device *skdev;
4989
4990 skdev = pci_get_drvdata(pdev);
4991 if (!skdev) {
4992 pr_err("%s: no device data for PCI\n", pci_name(pdev));
4993 return;
4994 }
4995 skd_stop_device(skdev);
4996 skd_release_irq(skdev);
4997
4998 for (i = 0; i < SKD_MAX_BARS; i++)
4999 if (skdev->mem_map[i])
5000 iounmap((u32 *)skdev->mem_map[i]);
5001
5002 if (skdev->pcie_error_reporting_is_enabled)
5003 pci_disable_pcie_error_reporting(pdev);
5004
5005 skd_destruct(skdev);
5006
5007 pci_release_regions(pdev);
5008 pci_disable_device(pdev);
5009 pci_set_drvdata(pdev, NULL);
5010
5011 return;
5012}
5013
5014static int skd_pci_suspend(struct pci_dev *pdev, pm_message_t state)
5015{
5016 int i;
5017 struct skd_device *skdev;
5018
5019 skdev = pci_get_drvdata(pdev);
5020 if (!skdev) {
5021 pr_err("%s: no device data for PCI\n", pci_name(pdev));
5022 return -EIO;
5023 }
5024
5025 skd_stop_device(skdev);
5026
5027 skd_release_irq(skdev);
5028
5029 for (i = 0; i < SKD_MAX_BARS; i++)
5030 if (skdev->mem_map[i])
5031 iounmap((u32 *)skdev->mem_map[i]);
5032
5033 if (skdev->pcie_error_reporting_is_enabled)
5034 pci_disable_pcie_error_reporting(pdev);
5035
5036 pci_release_regions(pdev);
5037 pci_save_state(pdev);
5038 pci_disable_device(pdev);
5039 pci_set_power_state(pdev, pci_choose_state(pdev, state));
5040 return 0;
5041}
5042
5043static int skd_pci_resume(struct pci_dev *pdev)
5044{
5045 int i;
5046 int rc = 0;
5047 struct skd_device *skdev;
5048
5049 skdev = pci_get_drvdata(pdev);
5050 if (!skdev) {
5051 pr_err("%s: no device data for PCI\n", pci_name(pdev));
5052 return -1;
5053 }
5054
5055 pci_set_power_state(pdev, PCI_D0);
5056 pci_enable_wake(pdev, PCI_D0, 0);
5057 pci_restore_state(pdev);
5058
5059 rc = pci_enable_device(pdev);
5060 if (rc)
5061 return rc;
5062 rc = pci_request_regions(pdev, DRV_NAME);
5063 if (rc)
5064 goto err_out;
5065 rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
5066 if (!rc) {
5067 if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) {
5068
5069 pr_err("(%s): consistent DMA mask error %d\n",
5070 pci_name(pdev), rc);
5071 }
5072 } else {
5073 rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
5074 if (rc) {
5075
5076 pr_err("(%s): DMA mask error %d\n",
5077 pci_name(pdev), rc);
5078 goto err_out_regions;
5079 }
5080 }
5081
5082 pci_set_master(pdev);
5083 rc = pci_enable_pcie_error_reporting(pdev);
5084 if (rc) {
5085 pr_err("(%s): bad enable of PCIe error reporting rc=%d\n",
5086 skdev->name, rc);
5087 skdev->pcie_error_reporting_is_enabled = 0;
5088 } else
5089 skdev->pcie_error_reporting_is_enabled = 1;
5090
5091 for (i = 0; i < SKD_MAX_BARS; i++) {
5092
5093 skdev->mem_phys[i] = pci_resource_start(pdev, i);
5094 skdev->mem_size[i] = (u32)pci_resource_len(pdev, i);
5095 skdev->mem_map[i] = ioremap(skdev->mem_phys[i],
5096 skdev->mem_size[i]);
5097 if (!skdev->mem_map[i]) {
5098 pr_err("(%s): Unable to map adapter memory!\n",
5099 skd_name(skdev));
5100 rc = -ENODEV;
5101 goto err_out_iounmap;
5102 }
5103 pr_debug("%s:%s:%d mem_map=%p, phyd=%016llx, size=%d\n",
5104 skdev->name, __func__, __LINE__,
5105 skdev->mem_map[i],
5106 (uint64_t)skdev->mem_phys[i], skdev->mem_size[i]);
5107 }
5108 rc = skd_acquire_irq(skdev);
5109 if (rc) {
5110
5111 pr_err("(%s): interrupt resource error %d\n",
5112 pci_name(pdev), rc);
5113 goto err_out_iounmap;
5114 }
5115
5116 rc = skd_start_timer(skdev);
5117 if (rc)
5118 goto err_out_timer;
5119
5120 init_waitqueue_head(&skdev->waitq);
5121
5122 skd_start_device(skdev);
5123
5124 return rc;
5125
5126err_out_timer:
5127 skd_stop_device(skdev);
5128 skd_release_irq(skdev);
5129
5130err_out_iounmap:
5131 for (i = 0; i < SKD_MAX_BARS; i++)
5132 if (skdev->mem_map[i])
5133 iounmap(skdev->mem_map[i]);
5134
5135 if (skdev->pcie_error_reporting_is_enabled)
5136 pci_disable_pcie_error_reporting(pdev);
5137
5138err_out_regions:
5139 pci_release_regions(pdev);
5140
5141err_out:
5142 pci_disable_device(pdev);
5143 return rc;
5144}
5145
5146static void skd_pci_shutdown(struct pci_dev *pdev)
5147{
5148 struct skd_device *skdev;
5149
5150 pr_err("skd_pci_shutdown called\n");
5151
5152 skdev = pci_get_drvdata(pdev);
5153 if (!skdev) {
5154 pr_err("%s: no device data for PCI\n", pci_name(pdev));
5155 return;
5156 }
5157
5158 pr_err("%s: calling stop\n", skd_name(skdev));
5159 skd_stop_device(skdev);
5160}
5161
5162static struct pci_driver skd_driver = {
5163 .name = DRV_NAME,
5164 .id_table = skd_pci_tbl,
5165 .probe = skd_pci_probe,
5166 .remove = skd_pci_remove,
5167 .suspend = skd_pci_suspend,
5168 .resume = skd_pci_resume,
5169 .shutdown = skd_pci_shutdown,
5170};
5171
5172/*
5173 *****************************************************************************
5174 * LOGGING SUPPORT
5175 *****************************************************************************
5176 */
5177
5178static const char *skd_name(struct skd_device *skdev)
5179{
5180 memset(skdev->id_str, 0, sizeof(skdev->id_str));
5181
5182 if (skdev->inquiry_is_valid)
5183 snprintf(skdev->id_str, sizeof(skdev->id_str), "%s:%s:[%s]",
5184 skdev->name, skdev->inq_serial_num,
5185 pci_name(skdev->pdev));
5186 else
5187 snprintf(skdev->id_str, sizeof(skdev->id_str), "%s:??:[%s]",
5188 skdev->name, pci_name(skdev->pdev));
5189
5190 return skdev->id_str;
5191}
5192
5193const char *skd_drive_state_to_str(int state)
5194{
5195 switch (state) {
5196 case FIT_SR_DRIVE_OFFLINE:
5197 return "OFFLINE";
5198 case FIT_SR_DRIVE_INIT:
5199 return "INIT";
5200 case FIT_SR_DRIVE_ONLINE:
5201 return "ONLINE";
5202 case FIT_SR_DRIVE_BUSY:
5203 return "BUSY";
5204 case FIT_SR_DRIVE_FAULT:
5205 return "FAULT";
5206 case FIT_SR_DRIVE_DEGRADED:
5207 return "DEGRADED";
5208 case FIT_SR_PCIE_LINK_DOWN:
5209 return "INK_DOWN";
5210 case FIT_SR_DRIVE_SOFT_RESET:
5211 return "SOFT_RESET";
5212 case FIT_SR_DRIVE_NEED_FW_DOWNLOAD:
5213 return "NEED_FW";
5214 case FIT_SR_DRIVE_INIT_FAULT:
5215 return "INIT_FAULT";
5216 case FIT_SR_DRIVE_BUSY_SANITIZE:
5217 return "BUSY_SANITIZE";
5218 case FIT_SR_DRIVE_BUSY_ERASE:
5219 return "BUSY_ERASE";
5220 case FIT_SR_DRIVE_FW_BOOTING:
5221 return "FW_BOOTING";
5222 default:
5223 return "???";
5224 }
5225}
5226
5227const char *skd_skdev_state_to_str(enum skd_drvr_state state)
5228{
5229 switch (state) {
5230 case SKD_DRVR_STATE_LOAD:
5231 return "LOAD";
5232 case SKD_DRVR_STATE_IDLE:
5233 return "IDLE";
5234 case SKD_DRVR_STATE_BUSY:
5235 return "BUSY";
5236 case SKD_DRVR_STATE_STARTING:
5237 return "STARTING";
5238 case SKD_DRVR_STATE_ONLINE:
5239 return "ONLINE";
5240 case SKD_DRVR_STATE_PAUSING:
5241 return "PAUSING";
5242 case SKD_DRVR_STATE_PAUSED:
5243 return "PAUSED";
5244 case SKD_DRVR_STATE_DRAINING_TIMEOUT:
5245 return "DRAINING_TIMEOUT";
5246 case SKD_DRVR_STATE_RESTARTING:
5247 return "RESTARTING";
5248 case SKD_DRVR_STATE_RESUMING:
5249 return "RESUMING";
5250 case SKD_DRVR_STATE_STOPPING:
5251 return "STOPPING";
5252 case SKD_DRVR_STATE_SYNCING:
5253 return "SYNCING";
5254 case SKD_DRVR_STATE_FAULT:
5255 return "FAULT";
5256 case SKD_DRVR_STATE_DISAPPEARED:
5257 return "DISAPPEARED";
5258 case SKD_DRVR_STATE_BUSY_ERASE:
5259 return "BUSY_ERASE";
5260 case SKD_DRVR_STATE_BUSY_SANITIZE:
5261 return "BUSY_SANITIZE";
5262 case SKD_DRVR_STATE_BUSY_IMMINENT:
5263 return "BUSY_IMMINENT";
5264 case SKD_DRVR_STATE_WAIT_BOOT:
5265 return "WAIT_BOOT";
5266
5267 default:
5268 return "???";
5269 }
5270}
5271
5272const char *skd_skmsg_state_to_str(enum skd_fit_msg_state state)
5273{
5274 switch (state) {
5275 case SKD_MSG_STATE_IDLE:
5276 return "IDLE";
5277 case SKD_MSG_STATE_BUSY:
5278 return "BUSY";
5279 default:
5280 return "???";
5281 }
5282}
5283
5284const char *skd_skreq_state_to_str(enum skd_req_state state)
5285{
5286 switch (state) {
5287 case SKD_REQ_STATE_IDLE:
5288 return "IDLE";
5289 case SKD_REQ_STATE_SETUP:
5290 return "SETUP";
5291 case SKD_REQ_STATE_BUSY:
5292 return "BUSY";
5293 case SKD_REQ_STATE_COMPLETED:
5294 return "COMPLETED";
5295 case SKD_REQ_STATE_TIMEOUT:
5296 return "TIMEOUT";
5297 case SKD_REQ_STATE_ABORTED:
5298 return "ABORTED";
5299 default:
5300 return "???";
5301 }
5302}
5303
5304static void skd_log_skdev(struct skd_device *skdev, const char *event)
5305{
5306 pr_debug("%s:%s:%d (%s) skdev=%p event='%s'\n",
5307 skdev->name, __func__, __LINE__, skdev->name, skdev, event);
5308 pr_debug("%s:%s:%d drive_state=%s(%d) driver_state=%s(%d)\n",
5309 skdev->name, __func__, __LINE__,
5310 skd_drive_state_to_str(skdev->drive_state), skdev->drive_state,
5311 skd_skdev_state_to_str(skdev->state), skdev->state);
5312 pr_debug("%s:%s:%d busy=%d limit=%d dev=%d lowat=%d\n",
5313 skdev->name, __func__, __LINE__,
5314 skdev->in_flight, skdev->cur_max_queue_depth,
5315 skdev->dev_max_queue_depth, skdev->queue_low_water_mark);
5316 pr_debug("%s:%s:%d timestamp=0x%x cycle=%d cycle_ix=%d\n",
5317 skdev->name, __func__, __LINE__,
5318 skdev->timeout_stamp, skdev->skcomp_cycle, skdev->skcomp_ix);
5319}
5320
5321static void skd_log_skmsg(struct skd_device *skdev,
5322 struct skd_fitmsg_context *skmsg, const char *event)
5323{
5324 pr_debug("%s:%s:%d (%s) skmsg=%p event='%s'\n",
5325 skdev->name, __func__, __LINE__, skdev->name, skmsg, event);
5326 pr_debug("%s:%s:%d state=%s(%d) id=0x%04x length=%d\n",
5327 skdev->name, __func__, __LINE__,
5328 skd_skmsg_state_to_str(skmsg->state), skmsg->state,
5329 skmsg->id, skmsg->length);
5330}
5331
5332static void skd_log_skreq(struct skd_device *skdev,
5333 struct skd_request_context *skreq, const char *event)
5334{
5335 pr_debug("%s:%s:%d (%s) skreq=%p event='%s'\n",
5336 skdev->name, __func__, __LINE__, skdev->name, skreq, event);
5337 pr_debug("%s:%s:%d state=%s(%d) id=0x%04x fitmsg=0x%04x\n",
5338 skdev->name, __func__, __LINE__,
5339 skd_skreq_state_to_str(skreq->state), skreq->state,
5340 skreq->id, skreq->fitmsg_id);
5341 pr_debug("%s:%s:%d timo=0x%x sg_dir=%d n_sg=%d\n",
5342 skdev->name, __func__, __LINE__,
5343 skreq->timeout_stamp, skreq->sg_data_dir, skreq->n_sg);
5344
5345 if (skreq->req != NULL) {
5346 struct request *req = skreq->req;
5347 u32 lba = (u32)blk_rq_pos(req);
5348 u32 count = blk_rq_sectors(req);
5349
5350 pr_debug("%s:%s:%d "
5351 "req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n",
5352 skdev->name, __func__, __LINE__,
5353 req, lba, lba, count, count,
5354 (int)rq_data_dir(req));
5355 } else
5356 pr_debug("%s:%s:%d req=NULL\n",
5357 skdev->name, __func__, __LINE__);
5358}
5359
5360/*
5361 *****************************************************************************
5362 * MODULE GLUE
5363 *****************************************************************************
5364 */
5365
5366static int __init skd_init(void)
5367{
5368 pr_info(PFX " v%s-b%s loaded\n", DRV_VERSION, DRV_BUILD_ID);
5369
5370 switch (skd_isr_type) {
5371 case SKD_IRQ_LEGACY:
5372 case SKD_IRQ_MSI:
5373 case SKD_IRQ_MSIX:
5374 break;
5375 default:
5376 pr_err(PFX "skd_isr_type %d invalid, re-set to %d\n",
5377 skd_isr_type, SKD_IRQ_DEFAULT);
5378 skd_isr_type = SKD_IRQ_DEFAULT;
5379 }
5380
5381 if (skd_max_queue_depth < 1 ||
5382 skd_max_queue_depth > SKD_MAX_QUEUE_DEPTH) {
5383 pr_err(PFX "skd_max_queue_depth %d invalid, re-set to %d\n",
5384 skd_max_queue_depth, SKD_MAX_QUEUE_DEPTH_DEFAULT);
5385 skd_max_queue_depth = SKD_MAX_QUEUE_DEPTH_DEFAULT;
5386 }
5387
5388 if (skd_max_req_per_msg < 1 || skd_max_req_per_msg > 14) {
5389 pr_err(PFX "skd_max_req_per_msg %d invalid, re-set to %d\n",
5390 skd_max_req_per_msg, SKD_MAX_REQ_PER_MSG_DEFAULT);
5391 skd_max_req_per_msg = SKD_MAX_REQ_PER_MSG_DEFAULT;
5392 }
5393
5394 if (skd_sgs_per_request < 1 || skd_sgs_per_request > 4096) {
5395 pr_err(PFX "skd_sg_per_request %d invalid, re-set to %d\n",
5396 skd_sgs_per_request, SKD_N_SG_PER_REQ_DEFAULT);
5397 skd_sgs_per_request = SKD_N_SG_PER_REQ_DEFAULT;
5398 }
5399
5400 if (skd_dbg_level < 0 || skd_dbg_level > 2) {
5401 pr_err(PFX "skd_dbg_level %d invalid, re-set to %d\n",
5402 skd_dbg_level, 0);
5403 skd_dbg_level = 0;
5404 }
5405
5406 if (skd_isr_comp_limit < 0) {
5407 pr_err(PFX "skd_isr_comp_limit %d invalid, set to %d\n",
5408 skd_isr_comp_limit, 0);
5409 skd_isr_comp_limit = 0;
5410 }
5411
5412 if (skd_max_pass_thru < 1 || skd_max_pass_thru > 50) {
5413 pr_err(PFX "skd_max_pass_thru %d invalid, re-set to %d\n",
5414 skd_max_pass_thru, SKD_N_SPECIAL_CONTEXT);
5415 skd_max_pass_thru = SKD_N_SPECIAL_CONTEXT;
5416 }
5417
5418 return pci_register_driver(&skd_driver);
5419}
5420
5421static void __exit skd_exit(void)
5422{
5423 pr_info(PFX " v%s-b%s unloading\n", DRV_VERSION, DRV_BUILD_ID);
5424
5425 pci_unregister_driver(&skd_driver);
5426
5427 if (skd_major)
5428 unregister_blkdev(skd_major, DRV_NAME);
5429}
5430
5431module_init(skd_init);
5432module_exit(skd_exit);
diff --git a/drivers/block/skd_s1120.h b/drivers/block/skd_s1120.h
new file mode 100644
index 000000000000..61c757ff0161
--- /dev/null
+++ b/drivers/block/skd_s1120.h
@@ -0,0 +1,330 @@
1/* Copyright 2012 STEC, Inc.
2 *
3 * This file is licensed under the terms of the 3-clause
4 * BSD License (http://opensource.org/licenses/BSD-3-Clause)
5 * or the GNU GPL-2.0 (http://www.gnu.org/licenses/gpl-2.0.html),
6 * at your option. Both licenses are also available in the LICENSE file
7 * distributed with this project. This file may not be copied, modified,
8 * or distributed except in accordance with those terms.
9 */
10
11
12#ifndef SKD_S1120_H
13#define SKD_S1120_H
14
15#pragma pack(push, s1120_h, 1)
16
17/*
18 * Q-channel, 64-bit r/w
19 */
20#define FIT_Q_COMMAND 0x400u
21#define FIT_QCMD_QID_MASK (0x3 << 1)
22#define FIT_QCMD_QID0 (0x0 << 1)
23#define FIT_QCMD_QID_NORMAL FIT_QCMD_QID0
24#define FIT_QCMD_QID1 (0x1 << 1)
25#define FIT_QCMD_QID2 (0x2 << 1)
26#define FIT_QCMD_QID3 (0x3 << 1)
27#define FIT_QCMD_FLUSH_QUEUE (0ull) /* add QID */
28#define FIT_QCMD_MSGSIZE_MASK (0x3 << 4)
29#define FIT_QCMD_MSGSIZE_64 (0x0 << 4)
30#define FIT_QCMD_MSGSIZE_128 (0x1 << 4)
31#define FIT_QCMD_MSGSIZE_256 (0x2 << 4)
32#define FIT_QCMD_MSGSIZE_512 (0x3 << 4)
33#define FIT_QCMD_BASE_ADDRESS_MASK (0xFFFFFFFFFFFFFFC0ull)
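
Taken together, these definitions suggest how a 64-bit queue command is assembled: a 64-byte-aligned FIT message address in bits 63:6, with the queue id and a message-size code in the low bits. A hedged, standalone sketch of composing such a value; the DMA address is made up and the real submission path lives elsewhere in this patch:

#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

/* Local copies of the relevant definitions. */
#define QCMD_QID_NORMAL		(0x0 << 1)
#define QCMD_MSGSIZE_128	(0x1 << 4)
#define QCMD_BASE_ADDRESS_MASK	0xFFFFFFFFFFFFFFC0ull

int main(void)
{
	uint64_t mb_dma = 0x1f23a000ull;	/* made-up, already 64-byte aligned */
	uint64_t qcmd;

	/* Base address in bits 63:6, queue id and message-size code in the low bits. */
	qcmd = (mb_dma & QCMD_BASE_ADDRESS_MASK) | QCMD_QID_NORMAL | QCMD_MSGSIZE_128;

	printf("qcmd = 0x%016" PRIx64 "\n", qcmd);	/* 0x000000001f23a010 */
	return 0;
}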
34
35/*
36 * Control, 32-bit r/w
37 */
38#define FIT_CONTROL 0x500u
39#define FIT_CR_HARD_RESET (1u << 0u)
40#define FIT_CR_SOFT_RESET (1u << 1u)
41#define FIT_CR_DIS_TIMESTAMPS (1u << 6u)
42#define FIT_CR_ENABLE_INTERRUPTS (1u << 7u)
43
44/*
45 * Status, 32-bit, r/o
46 */
47#define FIT_STATUS 0x510u
48#define FIT_SR_DRIVE_STATE_MASK 0x000000FFu
49#define FIT_SR_SIGNATURE (0xFF << 8)
50#define FIT_SR_PIO_DMA (1 << 16)
51#define FIT_SR_DRIVE_OFFLINE 0x00
52#define FIT_SR_DRIVE_INIT 0x01
53/* #define FIT_SR_DRIVE_READY 0x02 */
54#define FIT_SR_DRIVE_ONLINE 0x03
55#define FIT_SR_DRIVE_BUSY 0x04
56#define FIT_SR_DRIVE_FAULT 0x05
57#define FIT_SR_DRIVE_DEGRADED 0x06
58#define FIT_SR_PCIE_LINK_DOWN 0x07
59#define FIT_SR_DRIVE_SOFT_RESET 0x08
60#define FIT_SR_DRIVE_INIT_FAULT 0x09
61#define FIT_SR_DRIVE_BUSY_SANITIZE 0x0A
62#define FIT_SR_DRIVE_BUSY_ERASE 0x0B
63#define FIT_SR_DRIVE_FW_BOOTING 0x0C
64#define FIT_SR_DRIVE_NEED_FW_DOWNLOAD 0xFE
65#define FIT_SR_DEVICE_MISSING 0xFF
66#define FIT_SR__RESERVED 0xFFFFFF00u
67
68/*
69 * FIT_STATUS - Status register data definition
70 */
71#define FIT_SR_STATE_MASK (0xFF << 0)
72#define FIT_SR_SIGNATURE (0xFF << 8)
73#define FIT_SR_PIO_DMA (1 << 16)
74
75/*
76 * Interrupt status, 32-bit r/w1c (w1c ==> write 1 to clear)
77 */
78#define FIT_INT_STATUS_HOST 0x520u
79#define FIT_ISH_FW_STATE_CHANGE (1u << 0u)
80#define FIT_ISH_COMPLETION_POSTED (1u << 1u)
81#define FIT_ISH_MSG_FROM_DEV (1u << 2u)
82#define FIT_ISH_UNDEFINED_3 (1u << 3u)
83#define FIT_ISH_UNDEFINED_4 (1u << 4u)
84#define FIT_ISH_Q0_FULL (1u << 5u)
85#define FIT_ISH_Q1_FULL (1u << 6u)
86#define FIT_ISH_Q2_FULL (1u << 7u)
87#define FIT_ISH_Q3_FULL (1u << 8u)
88#define FIT_ISH_QCMD_FIFO_OVERRUN (1u << 9u)
89#define FIT_ISH_BAD_EXP_ROM_READ (1u << 10u)
90
91#define FIT_INT_DEF_MASK \
92 (FIT_ISH_FW_STATE_CHANGE | \
93 FIT_ISH_COMPLETION_POSTED | \
94 FIT_ISH_MSG_FROM_DEV | \
95 FIT_ISH_Q0_FULL | \
96 FIT_ISH_Q1_FULL | \
97 FIT_ISH_Q2_FULL | \
98 FIT_ISH_Q3_FULL | \
99 FIT_ISH_QCMD_FIFO_OVERRUN | \
100 FIT_ISH_BAD_EXP_ROM_READ)
101
102#define FIT_INT_QUEUE_FULL \
103 (FIT_ISH_Q0_FULL | \
104 FIT_ISH_Q1_FULL | \
105 FIT_ISH_Q2_FULL | \
106 FIT_ISH_Q3_FULL)
107
108#define MSI_MSG_NWL_ERROR_0 0x00000000
109#define MSI_MSG_NWL_ERROR_1 0x00000001
110#define MSI_MSG_NWL_ERROR_2 0x00000002
111#define MSI_MSG_NWL_ERROR_3 0x00000003
112#define MSI_MSG_STATE_CHANGE 0x00000004
113#define MSI_MSG_COMPLETION_POSTED 0x00000005
114#define MSI_MSG_MSG_FROM_DEV 0x00000006
115#define MSI_MSG_RESERVED_0 0x00000007
116#define MSI_MSG_RESERVED_1 0x00000008
117#define MSI_MSG_QUEUE_0_FULL 0x00000009
118#define MSI_MSG_QUEUE_1_FULL 0x0000000A
119#define MSI_MSG_QUEUE_2_FULL 0x0000000B
120#define MSI_MSG_QUEUE_3_FULL 0x0000000C
121
122#define FIT_INT_RESERVED_MASK \
123 (FIT_ISH_UNDEFINED_3 | \
124 FIT_ISH_UNDEFINED_4)
125
126/*
127 * Interrupt mask, 32-bit r/w
128 * Bit definitions are the same as FIT_INT_STATUS_HOST
129 */
130#define FIT_INT_MASK_HOST 0x528u
131
132/*
133 * Message to device, 32-bit r/w
134 */
135#define FIT_MSG_TO_DEVICE 0x540u
136
137/*
138 * Message from device, 32-bit, r/o
139 */
140#define FIT_MSG_FROM_DEVICE 0x548u
141
142/*
143 * 32-bit messages to/from device, composition/extraction macros
144 */
145#define FIT_MXD_CONS(TYPE, PARAM, DATA) \
146 ((((TYPE) & 0xFFu) << 24u) | \
147 (((PARAM) & 0xFFu) << 16u) | \
148 (((DATA) & 0xFFFFu) << 0u))
149#define FIT_MXD_TYPE(MXD) (((MXD) >> 24u) & 0xFFu)
150#define FIT_MXD_PARAM(MXD) (((MXD) >> 16u) & 0xFFu)
151#define FIT_MXD_DATA(MXD) (((MXD) >> 0u) & 0xFFFFu)
152
153/*
154 * Types of messages to/from device
155 */
156#define FIT_MTD_FITFW_INIT 0x01u
157#define FIT_MTD_GET_CMDQ_DEPTH 0x02u
158#define FIT_MTD_SET_COMPQ_DEPTH 0x03u
159#define FIT_MTD_SET_COMPQ_ADDR 0x04u
160#define FIT_MTD_ARM_QUEUE 0x05u
161#define FIT_MTD_CMD_LOG_HOST_ID 0x07u
162#define FIT_MTD_CMD_LOG_TIME_STAMP_LO 0x08u
163#define FIT_MTD_CMD_LOG_TIME_STAMP_HI 0x09u
164#define FIT_MFD_SMART_EXCEEDED 0x10u
165#define FIT_MFD_POWER_DOWN 0x11u
166#define FIT_MFD_OFFLINE 0x12u
167#define FIT_MFD_ONLINE 0x13u
168#define FIT_MFD_FW_RESTARTING 0x14u
169#define FIT_MFD_PM_ACTIVE 0x15u
170#define FIT_MFD_PM_STANDBY 0x16u
171#define FIT_MFD_PM_SLEEP 0x17u
172#define FIT_MFD_CMD_PROGRESS 0x18u
173
174#define FIT_MTD_DEBUG 0xFEu
175#define FIT_MFD_DEBUG 0xFFu
176
177#define FIT_MFD_MASK (0xFFu)
178#define FIT_MFD_DATA_MASK (0xFFu)
179#define FIT_MFD_MSG(x) (((x) >> 24) & FIT_MFD_MASK)
180#define FIT_MFD_DATA(x) ((x) & FIT_MFD_MASK)
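
The 32-bit mailbox values exchanged through FIT_MSG_TO_DEVICE / FIT_MSG_FROM_DEVICE pack a type byte, a parameter byte and 16 bits of data; FIT_MXD_CONS builds one and the extraction macros take it apart. A standalone round trip, with the macros copied locally so the sketch compiles on its own; the depth value of 64 is purely illustrative:

#include <stdint.h>
#include <stdio.h>

/* Local copies of the composition/extraction macros from skd_s1120.h. */
#define FIT_MXD_CONS(TYPE, PARAM, DATA) \
	((((TYPE) & 0xFFu) << 24u) | \
	 (((PARAM) & 0xFFu) << 16u) | \
	 (((DATA) & 0xFFFFu) << 0u))
#define FIT_MXD_TYPE(MXD)	(((MXD) >> 24u) & 0xFFu)
#define FIT_MXD_PARAM(MXD)	(((MXD) >> 16u) & 0xFFu)
#define FIT_MXD_DATA(MXD)	(((MXD) >> 0u) & 0xFFFFu)

#define FIT_MTD_SET_COMPQ_DEPTH	0x03u	/* one of the "message to device" types */

int main(void)
{
	/* Hypothetical request: set a completion queue depth of 64 (param unused here). */
	uint32_t mtd = FIT_MXD_CONS(FIT_MTD_SET_COMPQ_DEPTH, 0, 64);

	printf("mtd   = 0x%08x\n", mtd);		/* 0x03000040 */
	printf("type  = 0x%02x\n", FIT_MXD_TYPE(mtd));
	printf("param = 0x%02x\n", FIT_MXD_PARAM(mtd));
	printf("data  = %u\n", FIT_MXD_DATA(mtd));
	return 0;
}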
181
182/*
183 * Extra arg to FIT_MSG_TO_DEVICE, 64-bit r/w
184 * Used to set completion queue address (FIT_MTD_SET_COMPQ_ADDR)
185 * (was Response buffer in docs)
186 */
187#define FIT_MSG_TO_DEVICE_ARG 0x580u
188
189/*
190 * Hardware (ASIC) version, 32-bit r/o
191 */
192#define FIT_HW_VERSION 0x588u
193
194/*
195 * Scatter/gather list descriptor.
196 * 32-bytes and must be aligned on a 32-byte boundary.
197 * All fields are in little endian order.
198 */
199struct fit_sg_descriptor {
200 uint32_t control;
201 uint32_t byte_count;
202 uint64_t host_side_addr;
203 uint64_t dev_side_addr;
204 uint64_t next_desc_ptr;
205};
206
207#define FIT_SGD_CONTROL_NOT_LAST 0x000u
208#define FIT_SGD_CONTROL_LAST 0x40Eu
209
210/*
211 * Header at the beginning of a FIT message. The header
212 * is followed by SSDI requests each 64 bytes.
213 * A FIT message can be up to 512 bytes long and must start
214 * on a 64-byte boundary.
215 */
216struct fit_msg_hdr {
217 uint8_t protocol_id;
218 uint8_t num_protocol_cmds_coalesced;
219 uint8_t _reserved[62];
220};
221
222#define FIT_PROTOCOL_ID_FIT 1
223#define FIT_PROTOCOL_ID_SSDI 2
224#define FIT_PROTOCOL_ID_SOFIT 3
225
226
227#define FIT_PROTOCOL_MINOR_VER(mtd_val) ((mtd_val >> 16) & 0xF)
228#define FIT_PROTOCOL_MAJOR_VER(mtd_val) ((mtd_val >> 20) & 0xF)
229
230/*
231 * Format of a completion entry. The completion queue is circular
232 * and must have at least as many entries as the maximum number
233 * of commands that may be issued to the device.
234 *
235 * There are no head/tail pointers. The cycle value is used to
236 * infer the presence of new completion records.
237 * Initially the cycle in all entries is 0, the index is 0, and
238 * the cycle value to expect is 1. When completions are added
239 * their cycle values are set to 1. When the index wraps the
240 * cycle value to expect is incremented.
241 *
242 * Command_context is opaque and taken verbatim from the SSDI command.
243 * All other fields are big endian.
244 */
245#define FIT_PROTOCOL_VERSION_0 0
246
247/*
248 * Protocol major version 1 completion entry.
249 * The major protocol version is found in bits
250 * 20-23 of the FIT_MTD_FITFW_INIT response.
251 */
252struct fit_completion_entry_v1 {
253 uint32_t num_returned_bytes;
254 uint16_t tag;
255 uint8_t status; /* SCSI status */
256 uint8_t cycle;
257};
258#define FIT_PROTOCOL_VERSION_1 1
259#define FIT_PROTOCOL_VERSION_CURRENT FIT_PROTOCOL_VERSION_1
260
261struct fit_comp_error_info {
262 uint8_t type:7; /* 00: Bits0-6 indicates the type of sense data. */
263 uint8_t valid:1; /* 00: Bit 7 := 1 ==> info field is valid. */
264 uint8_t reserved0; /* 01: Obsolete field */
265 uint8_t key:4; /* 02: Bits0-3 indicate the sense key. */
266 uint8_t reserved2:1; /* 02: Reserved bit. */
267 uint8_t bad_length:1; /* 02: Incorrect Length Indicator */
268 uint8_t end_medium:1; /* 02: End of Medium */
269 uint8_t file_mark:1; /* 02: Filemark */
270 uint8_t info[4]; /* 03: */
271 uint8_t reserved1; /* 07: Additional Sense Length */
272 uint8_t cmd_spec[4]; /* 08: Command Specific Information */
273 uint8_t code; /* 0C: Additional Sense Code */
274 uint8_t qual; /* 0D: Additional Sense Code Qualifier */
275 uint8_t fruc; /* 0E: Field Replaceable Unit Code */
276 uint8_t sks_high:7; /* 0F: Sense Key Specific (MSB) */
277 uint8_t sks_valid:1; /* 0F: Sense Key Specific Valid */
278 uint16_t sks_low; /* 10: Sense Key Specific (LSW) */
279 uint16_t reserved3; /* 12: Part of additional sense bytes (unused) */
280 uint16_t uec; /* 14: Additional Sense Bytes */
281 uint64_t per; /* 16: Additional Sense Bytes */
282 uint8_t reserved4[2]; /* 1E: Additional Sense Bytes (unused) */
283};
284
285
286/* Task management constants */
287#define SOFT_TASK_SIMPLE 0x00
288#define SOFT_TASK_HEAD_OF_QUEUE 0x01
289#define SOFT_TASK_ORDERED 0x02
290
 291/* Version zero has the last 32 bits reserved;
 292 * version one uses the last 32 bits as sg_list_len_bytes.
293 */
294struct skd_command_header {
295 uint64_t sg_list_dma_address;
296 uint16_t tag;
297 uint8_t attribute;
298 uint8_t add_cdb_len; /* In 32 bit words */
299 uint32_t sg_list_len_bytes;
300};
301
302struct skd_scsi_request {
303 struct skd_command_header hdr;
304 unsigned char cdb[16];
305/* unsigned char _reserved[16]; */
306};
307
308struct driver_inquiry_data {
309 uint8_t peripheral_device_type:5;
310 uint8_t qualifier:3;
311 uint8_t page_code;
312 uint16_t page_length;
313 uint16_t pcie_bus_number;
314 uint8_t pcie_device_number;
315 uint8_t pcie_function_number;
316 uint8_t pcie_link_speed;
317 uint8_t pcie_link_lanes;
318 uint16_t pcie_vendor_id;
319 uint16_t pcie_device_id;
320 uint16_t pcie_subsystem_vendor_id;
321 uint16_t pcie_subsystem_device_id;
322 uint8_t reserved1[2];
323 uint8_t reserved2[3];
324 uint8_t driver_version_length;
325 uint8_t driver_version[0x14];
326};
327
328#pragma pack(pop, s1120_h)
329
330#endif /* SKD_S1120_H */
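
The comment block at lines 230-244 above describes how the completion queue signals new entries with a cycle value instead of head/tail pointers. Below is a minimal host-side consumer sketch of that scheme, using struct fit_completion_entry_v1 as defined above; the function name and parameters are illustrative only (the real consumer lives in drivers/block/skd_main.c), and a real implementation would also need DMA syncs/barriers before reading each entry.

/*
 * Illustrative only: walk the circular completion queue, using the cycle
 * field to detect entries the device has newly written.
 */
static int poll_completions(struct fit_completion_entry_v1 *compq,
			    uint32_t nentries, uint32_t *index,
			    uint8_t *expect_cycle)
{
	int handled = 0;

	/* An entry is new when its cycle matches the value we expect. */
	while (compq[*index].cycle == *expect_cycle) {
		/*
		 * compq[*index].tag identifies the command, .status carries
		 * the SCSI status, .num_returned_bytes the transfer length.
		 */
		handled++;

		if (++(*index) == nentries) {
			/* Index wrapped: bump the cycle value to expect. */
			*index = 0;
			(*expect_cycle)++;
		}
	}

	return handled;
}
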
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index bf4b9d282c04..6620b73d0490 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -887,6 +887,8 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
887 unsigned long secure; 887 unsigned long secure;
888 struct phys_req preq; 888 struct phys_req preq;
889 889
890 xen_blkif_get(blkif);
891
890 preq.sector_number = req->u.discard.sector_number; 892 preq.sector_number = req->u.discard.sector_number;
891 preq.nr_sects = req->u.discard.nr_sectors; 893 preq.nr_sects = req->u.discard.nr_sectors;
892 894
@@ -899,7 +901,6 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
899 } 901 }
900 blkif->st_ds_req++; 902 blkif->st_ds_req++;
901 903
902 xen_blkif_get(blkif);
903 secure = (blkif->vbd.discard_secure && 904 secure = (blkif->vbd.discard_secure &&
904 (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? 905 (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
905 BLKDEV_DISCARD_SECURE : 0; 906 BLKDEV_DISCARD_SECURE : 0;
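
The hunk above only moves the xen_blkif_get() call ahead of the translate/error check: the reference must be taken before any early branch to the common exit, so the matching xen_blkif_put() there (outside this hunk) is always balanced. A minimal sketch of that shape, with invented names (obj/do_request are not Xen symbols):

#include <linux/atomic.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/types.h>

struct obj {
	atomic_t ref;
};

static void obj_put(struct obj *o)
{
	if (atomic_dec_and_test(&o->ref))
		kfree(o);
}

static int do_request(struct obj *o, bool invalid)
{
	int err = 0;

	/* Take the reference up front, before any goto to the exit label. */
	atomic_inc(&o->ref);

	if (invalid) {
		err = -EINVAL;
		goto out;	/* previously this path reached the put without a get */
	}

	/* ... issue the actual work ... */

out:
	obj_put(o);	/* always balanced now */
	return err;
}
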
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 8d53ed293606..432db1b59b00 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -121,7 +121,8 @@ struct blkfront_info
121 struct work_struct work; 121 struct work_struct work;
122 struct gnttab_free_callback callback; 122 struct gnttab_free_callback callback;
123 struct blk_shadow shadow[BLK_RING_SIZE]; 123 struct blk_shadow shadow[BLK_RING_SIZE];
124 struct list_head persistent_gnts; 124 struct list_head grants;
125 struct list_head indirect_pages;
125 unsigned int persistent_gnts_c; 126 unsigned int persistent_gnts_c;
126 unsigned long shadow_free; 127 unsigned long shadow_free;
127 unsigned int feature_flush; 128 unsigned int feature_flush;
@@ -200,15 +201,17 @@ static int fill_grant_buffer(struct blkfront_info *info, int num)
200 if (!gnt_list_entry) 201 if (!gnt_list_entry)
201 goto out_of_memory; 202 goto out_of_memory;
202 203
203 granted_page = alloc_page(GFP_NOIO); 204 if (info->feature_persistent) {
204 if (!granted_page) { 205 granted_page = alloc_page(GFP_NOIO);
205 kfree(gnt_list_entry); 206 if (!granted_page) {
206 goto out_of_memory; 207 kfree(gnt_list_entry);
208 goto out_of_memory;
209 }
210 gnt_list_entry->pfn = page_to_pfn(granted_page);
207 } 211 }
208 212
209 gnt_list_entry->pfn = page_to_pfn(granted_page);
210 gnt_list_entry->gref = GRANT_INVALID_REF; 213 gnt_list_entry->gref = GRANT_INVALID_REF;
211 list_add(&gnt_list_entry->node, &info->persistent_gnts); 214 list_add(&gnt_list_entry->node, &info->grants);
212 i++; 215 i++;
213 } 216 }
214 217
@@ -216,9 +219,10 @@ static int fill_grant_buffer(struct blkfront_info *info, int num)
216 219
217out_of_memory: 220out_of_memory:
218 list_for_each_entry_safe(gnt_list_entry, n, 221 list_for_each_entry_safe(gnt_list_entry, n,
219 &info->persistent_gnts, node) { 222 &info->grants, node) {
220 list_del(&gnt_list_entry->node); 223 list_del(&gnt_list_entry->node);
221 __free_page(pfn_to_page(gnt_list_entry->pfn)); 224 if (info->feature_persistent)
225 __free_page(pfn_to_page(gnt_list_entry->pfn));
222 kfree(gnt_list_entry); 226 kfree(gnt_list_entry);
223 i--; 227 i--;
224 } 228 }
@@ -227,13 +231,14 @@ out_of_memory:
227} 231}
228 232
229static struct grant *get_grant(grant_ref_t *gref_head, 233static struct grant *get_grant(grant_ref_t *gref_head,
234 unsigned long pfn,
230 struct blkfront_info *info) 235 struct blkfront_info *info)
231{ 236{
232 struct grant *gnt_list_entry; 237 struct grant *gnt_list_entry;
233 unsigned long buffer_mfn; 238 unsigned long buffer_mfn;
234 239
235 BUG_ON(list_empty(&info->persistent_gnts)); 240 BUG_ON(list_empty(&info->grants));
236 gnt_list_entry = list_first_entry(&info->persistent_gnts, struct grant, 241 gnt_list_entry = list_first_entry(&info->grants, struct grant,
237 node); 242 node);
238 list_del(&gnt_list_entry->node); 243 list_del(&gnt_list_entry->node);
239 244
@@ -245,6 +250,10 @@ static struct grant *get_grant(grant_ref_t *gref_head,
245 /* Assign a gref to this page */ 250 /* Assign a gref to this page */
246 gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); 251 gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head);
247 BUG_ON(gnt_list_entry->gref == -ENOSPC); 252 BUG_ON(gnt_list_entry->gref == -ENOSPC);
253 if (!info->feature_persistent) {
254 BUG_ON(!pfn);
255 gnt_list_entry->pfn = pfn;
256 }
248 buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn); 257 buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn);
249 gnttab_grant_foreign_access_ref(gnt_list_entry->gref, 258 gnttab_grant_foreign_access_ref(gnt_list_entry->gref,
250 info->xbdev->otherend_id, 259 info->xbdev->otherend_id,
@@ -400,10 +409,13 @@ static int blkif_queue_request(struct request *req)
400 if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) 409 if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
401 return 1; 410 return 1;
402 411
403 max_grefs = info->max_indirect_segments ? 412 max_grefs = req->nr_phys_segments;
404 info->max_indirect_segments + 413 if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST)
405 INDIRECT_GREFS(info->max_indirect_segments) : 414 /*
406 BLKIF_MAX_SEGMENTS_PER_REQUEST; 415 * If we are using indirect segments we need to account
416 * for the indirect grefs used in the request.
417 */
418 max_grefs += INDIRECT_GREFS(req->nr_phys_segments);
407 419
408 /* Check if we have enough grants to allocate a requests */ 420 /* Check if we have enough grants to allocate a requests */
409 if (info->persistent_gnts_c < max_grefs) { 421 if (info->persistent_gnts_c < max_grefs) {
@@ -477,22 +489,34 @@ static int blkif_queue_request(struct request *req)
477 489
478 if ((ring_req->operation == BLKIF_OP_INDIRECT) && 490 if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
479 (i % SEGS_PER_INDIRECT_FRAME == 0)) { 491 (i % SEGS_PER_INDIRECT_FRAME == 0)) {
492 unsigned long pfn;
493
480 if (segments) 494 if (segments)
481 kunmap_atomic(segments); 495 kunmap_atomic(segments);
482 496
483 n = i / SEGS_PER_INDIRECT_FRAME; 497 n = i / SEGS_PER_INDIRECT_FRAME;
484 gnt_list_entry = get_grant(&gref_head, info); 498 if (!info->feature_persistent) {
499 struct page *indirect_page;
500
501 /* Fetch a pre-allocated page to use for indirect grefs */
502 BUG_ON(list_empty(&info->indirect_pages));
503 indirect_page = list_first_entry(&info->indirect_pages,
504 struct page, lru);
505 list_del(&indirect_page->lru);
506 pfn = page_to_pfn(indirect_page);
507 }
508 gnt_list_entry = get_grant(&gref_head, pfn, info);
485 info->shadow[id].indirect_grants[n] = gnt_list_entry; 509 info->shadow[id].indirect_grants[n] = gnt_list_entry;
486 segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); 510 segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
487 ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; 511 ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
488 } 512 }
489 513
490 gnt_list_entry = get_grant(&gref_head, info); 514 gnt_list_entry = get_grant(&gref_head, page_to_pfn(sg_page(sg)), info);
491 ref = gnt_list_entry->gref; 515 ref = gnt_list_entry->gref;
492 516
493 info->shadow[id].grants_used[i] = gnt_list_entry; 517 info->shadow[id].grants_used[i] = gnt_list_entry;
494 518
495 if (rq_data_dir(req)) { 519 if (rq_data_dir(req) && info->feature_persistent) {
496 char *bvec_data; 520 char *bvec_data;
497 void *shared_data; 521 void *shared_data;
498 522
@@ -904,21 +928,36 @@ static void blkif_free(struct blkfront_info *info, int suspend)
904 blk_stop_queue(info->rq); 928 blk_stop_queue(info->rq);
905 929
906 /* Remove all persistent grants */ 930 /* Remove all persistent grants */
907 if (!list_empty(&info->persistent_gnts)) { 931 if (!list_empty(&info->grants)) {
908 list_for_each_entry_safe(persistent_gnt, n, 932 list_for_each_entry_safe(persistent_gnt, n,
909 &info->persistent_gnts, node) { 933 &info->grants, node) {
910 list_del(&persistent_gnt->node); 934 list_del(&persistent_gnt->node);
911 if (persistent_gnt->gref != GRANT_INVALID_REF) { 935 if (persistent_gnt->gref != GRANT_INVALID_REF) {
912 gnttab_end_foreign_access(persistent_gnt->gref, 936 gnttab_end_foreign_access(persistent_gnt->gref,
913 0, 0UL); 937 0, 0UL);
914 info->persistent_gnts_c--; 938 info->persistent_gnts_c--;
915 } 939 }
916 __free_page(pfn_to_page(persistent_gnt->pfn)); 940 if (info->feature_persistent)
941 __free_page(pfn_to_page(persistent_gnt->pfn));
917 kfree(persistent_gnt); 942 kfree(persistent_gnt);
918 } 943 }
919 } 944 }
920 BUG_ON(info->persistent_gnts_c != 0); 945 BUG_ON(info->persistent_gnts_c != 0);
921 946
947 /*
948 * Remove indirect pages, this only happens when using indirect
949 * descriptors but not persistent grants
950 */
951 if (!list_empty(&info->indirect_pages)) {
952 struct page *indirect_page, *n;
953
954 BUG_ON(info->feature_persistent);
955 list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) {
956 list_del(&indirect_page->lru);
957 __free_page(indirect_page);
958 }
959 }
960
922 for (i = 0; i < BLK_RING_SIZE; i++) { 961 for (i = 0; i < BLK_RING_SIZE; i++) {
923 /* 962 /*
924 * Clear persistent grants present in requests already 963 * Clear persistent grants present in requests already
@@ -933,7 +972,8 @@ static void blkif_free(struct blkfront_info *info, int suspend)
933 for (j = 0; j < segs; j++) { 972 for (j = 0; j < segs; j++) {
934 persistent_gnt = info->shadow[i].grants_used[j]; 973 persistent_gnt = info->shadow[i].grants_used[j];
935 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); 974 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
936 __free_page(pfn_to_page(persistent_gnt->pfn)); 975 if (info->feature_persistent)
976 __free_page(pfn_to_page(persistent_gnt->pfn));
937 kfree(persistent_gnt); 977 kfree(persistent_gnt);
938 } 978 }
939 979
@@ -992,7 +1032,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
992 nseg = s->req.operation == BLKIF_OP_INDIRECT ? 1032 nseg = s->req.operation == BLKIF_OP_INDIRECT ?
993 s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; 1033 s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
994 1034
995 if (bret->operation == BLKIF_OP_READ) { 1035 if (bret->operation == BLKIF_OP_READ && info->feature_persistent) {
996 /* 1036 /*
997 * Copy the data received from the backend into the bvec. 1037 * Copy the data received from the backend into the bvec.
998 * Since bv_offset can be different than 0, and bv_len different 1038 * Since bv_offset can be different than 0, and bv_len different
@@ -1013,13 +1053,51 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
1013 } 1053 }
1014 /* Add the persistent grant into the list of free grants */ 1054 /* Add the persistent grant into the list of free grants */
1015 for (i = 0; i < nseg; i++) { 1055 for (i = 0; i < nseg; i++) {
1016 list_add(&s->grants_used[i]->node, &info->persistent_gnts); 1056 if (gnttab_query_foreign_access(s->grants_used[i]->gref)) {
1017 info->persistent_gnts_c++; 1057 /*
1058 * If the grant is still mapped by the backend (the
1059 * backend has chosen to make this grant persistent)
1060 * we add it at the head of the list, so it will be
1061 * reused first.
1062 */
1063 if (!info->feature_persistent)
 1064 pr_alert_ratelimited("backend has not unmapped grant: %u\n",
1065 s->grants_used[i]->gref);
1066 list_add(&s->grants_used[i]->node, &info->grants);
1067 info->persistent_gnts_c++;
1068 } else {
1069 /*
1070 * If the grant is not mapped by the backend we end the
1071 * foreign access and add it to the tail of the list,
1072 * so it will not be picked again unless we run out of
1073 * persistent grants.
1074 */
1075 gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL);
1076 s->grants_used[i]->gref = GRANT_INVALID_REF;
1077 list_add_tail(&s->grants_used[i]->node, &info->grants);
1078 }
1018 } 1079 }
1019 if (s->req.operation == BLKIF_OP_INDIRECT) { 1080 if (s->req.operation == BLKIF_OP_INDIRECT) {
1020 for (i = 0; i < INDIRECT_GREFS(nseg); i++) { 1081 for (i = 0; i < INDIRECT_GREFS(nseg); i++) {
1021 list_add(&s->indirect_grants[i]->node, &info->persistent_gnts); 1082 if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) {
1022 info->persistent_gnts_c++; 1083 if (!info->feature_persistent)
 1084 pr_alert_ratelimited("backend has not unmapped grant: %u\n",
1085 s->indirect_grants[i]->gref);
1086 list_add(&s->indirect_grants[i]->node, &info->grants);
1087 info->persistent_gnts_c++;
1088 } else {
1089 struct page *indirect_page;
1090
1091 gnttab_end_foreign_access(s->indirect_grants[i]->gref, 0, 0UL);
1092 /*
1093 * Add the used indirect page back to the list of
1094 * available pages for indirect grefs.
1095 */
1096 indirect_page = pfn_to_page(s->indirect_grants[i]->pfn);
1097 list_add(&indirect_page->lru, &info->indirect_pages);
1098 s->indirect_grants[i]->gref = GRANT_INVALID_REF;
1099 list_add_tail(&s->indirect_grants[i]->node, &info->grants);
1100 }
1023 } 1101 }
1024 } 1102 }
1025} 1103}
@@ -1313,7 +1391,8 @@ static int blkfront_probe(struct xenbus_device *dev,
1313 spin_lock_init(&info->io_lock); 1391 spin_lock_init(&info->io_lock);
1314 info->xbdev = dev; 1392 info->xbdev = dev;
1315 info->vdevice = vdevice; 1393 info->vdevice = vdevice;
1316 INIT_LIST_HEAD(&info->persistent_gnts); 1394 INIT_LIST_HEAD(&info->grants);
1395 INIT_LIST_HEAD(&info->indirect_pages);
1317 info->persistent_gnts_c = 0; 1396 info->persistent_gnts_c = 0;
1318 info->connected = BLKIF_STATE_DISCONNECTED; 1397 info->connected = BLKIF_STATE_DISCONNECTED;
1319 INIT_WORK(&info->work, blkif_restart_queue); 1398 INIT_WORK(&info->work, blkif_restart_queue);
@@ -1609,6 +1688,23 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
1609 if (err) 1688 if (err)
1610 goto out_of_memory; 1689 goto out_of_memory;
1611 1690
1691 if (!info->feature_persistent && info->max_indirect_segments) {
1692 /*
1693 * We are using indirect descriptors but not persistent
1694 * grants, we need to allocate a set of pages that can be
1695 * used for mapping indirect grefs
1696 */
1697 int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE;
1698
1699 BUG_ON(!list_empty(&info->indirect_pages));
1700 for (i = 0; i < num; i++) {
1701 struct page *indirect_page = alloc_page(GFP_NOIO);
1702 if (!indirect_page)
1703 goto out_of_memory;
1704 list_add(&indirect_page->lru, &info->indirect_pages);
1705 }
1706 }
1707
1612 for (i = 0; i < BLK_RING_SIZE; i++) { 1708 for (i = 0; i < BLK_RING_SIZE; i++) {
1613 info->shadow[i].grants_used = kzalloc( 1709 info->shadow[i].grants_used = kzalloc(
1614 sizeof(info->shadow[i].grants_used[0]) * segs, 1710 sizeof(info->shadow[i].grants_used[0]) * segs,
@@ -1639,6 +1735,13 @@ out_of_memory:
1639 kfree(info->shadow[i].indirect_grants); 1735 kfree(info->shadow[i].indirect_grants);
1640 info->shadow[i].indirect_grants = NULL; 1736 info->shadow[i].indirect_grants = NULL;
1641 } 1737 }
1738 if (!list_empty(&info->indirect_pages)) {
1739 struct page *indirect_page, *n;
1740 list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) {
1741 list_del(&indirect_page->lru);
1742 __free_page(indirect_page);
1743 }
1744 }
1642 return -ENOMEM; 1745 return -ENOMEM;
1643} 1746}
1644 1747
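
The new max_grefs computation in blkif_queue_request() above charges one grant per data segment plus, on the indirect path, one extra grant per indirect page. A small arithmetic sketch follows; the constants are assumptions for illustration (the real values come from the blkif ring ABI and the page size). With these numbers, for example, a 700-segment request needs 700 + 2 = 702 grants.

/* Assumed values, for illustration only. */
#define BLKIF_MAX_SEGMENTS_PER_REQUEST	11
#define SEGS_PER_INDIRECT_FRAME		512	/* segments held by one indirect page */
#define DIV_ROUND_UP(n, d)		(((n) + (d) - 1) / (d))
#define INDIRECT_GREFS(segs)		DIV_ROUND_UP(segs, SEGS_PER_INDIRECT_FRAME)

static unsigned int grefs_needed(unsigned int nr_phys_segments)
{
	unsigned int max_grefs = nr_phys_segments;

	/*
	 * Requests too big for a plain ring slot use indirect descriptors,
	 * which cost one extra grant per indirect page on top of one grant
	 * per data segment.
	 */
	if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST)
		max_grefs += INDIRECT_GREFS(nr_phys_segments);

	return max_grefs;
}
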
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index f950c9d29f3e..2638417b19aa 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -13,15 +13,8 @@ config BCACHE_DEBUG
13 ---help--- 13 ---help---
14 Don't select this option unless you're a developer 14 Don't select this option unless you're a developer
15 15
16 Enables extra debugging tools (primarily a fuzz tester) 16 Enables extra debugging tools, allows expensive runtime checks to be
17 17 turned on.
18config BCACHE_EDEBUG
19 bool "Extended runtime checks"
20 depends on BCACHE
21 ---help---
22 Don't select this option unless you're a developer
23
24 Enables extra runtime checks which significantly affect performance
25 18
26config BCACHE_CLOSURES_DEBUG 19config BCACHE_CLOSURES_DEBUG
27 bool "Debug closures" 20 bool "Debug closures"
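
The Kconfig hunk above folds the old compile-time BCACHE_EDEBUG option into runtime checks; the alloc.c and bcache.h changes below gate them on the new expensive_debug_checks flag instead of #ifdef CONFIG_BCACHE_EDEBUG. The wrapper below is a sketch of what that gating implies, assuming it lives under CONFIG_BCACHE_DEBUG; the actual definitions in the bcache debug headers may differ.

/* Sketch: a run-time gate replacing the old compile-time BCACHE_EDEBUG. */
#ifdef CONFIG_BCACHE_DEBUG

#define expensive_debug_checks(c)	((c)->expensive_debug_checks)
#define EBUG_ON(cond)			BUG_ON(cond)

#else /* !CONFIG_BCACHE_DEBUG */

#define expensive_debug_checks(c)	0
#define EBUG_ON(cond)			do { if (cond); } while (0)

#endif
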
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index e45f5575fd4d..2b46bf1d7e40 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -63,13 +63,12 @@
63#include "bcache.h" 63#include "bcache.h"
64#include "btree.h" 64#include "btree.h"
65 65
66#include <linux/blkdev.h>
66#include <linux/freezer.h> 67#include <linux/freezer.h>
67#include <linux/kthread.h> 68#include <linux/kthread.h>
68#include <linux/random.h> 69#include <linux/random.h>
69#include <trace/events/bcache.h> 70#include <trace/events/bcache.h>
70 71
71#define MAX_IN_FLIGHT_DISCARDS 8U
72
73/* Bucket heap / gen */ 72/* Bucket heap / gen */
74 73
75uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) 74uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
@@ -121,75 +120,6 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
121 mutex_unlock(&c->bucket_lock); 120 mutex_unlock(&c->bucket_lock);
122} 121}
123 122
124/* Discard/TRIM */
125
126struct discard {
127 struct list_head list;
128 struct work_struct work;
129 struct cache *ca;
130 long bucket;
131
132 struct bio bio;
133 struct bio_vec bv;
134};
135
136static void discard_finish(struct work_struct *w)
137{
138 struct discard *d = container_of(w, struct discard, work);
139 struct cache *ca = d->ca;
140 char buf[BDEVNAME_SIZE];
141
142 if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) {
143 pr_notice("discard error on %s, disabling",
144 bdevname(ca->bdev, buf));
145 d->ca->discard = 0;
146 }
147
148 mutex_lock(&ca->set->bucket_lock);
149
150 fifo_push(&ca->free, d->bucket);
151 list_add(&d->list, &ca->discards);
152 atomic_dec(&ca->discards_in_flight);
153
154 mutex_unlock(&ca->set->bucket_lock);
155
156 closure_wake_up(&ca->set->bucket_wait);
157 wake_up_process(ca->alloc_thread);
158
159 closure_put(&ca->set->cl);
160}
161
162static void discard_endio(struct bio *bio, int error)
163{
164 struct discard *d = container_of(bio, struct discard, bio);
165 schedule_work(&d->work);
166}
167
168static void do_discard(struct cache *ca, long bucket)
169{
170 struct discard *d = list_first_entry(&ca->discards,
171 struct discard, list);
172
173 list_del(&d->list);
174 d->bucket = bucket;
175
176 atomic_inc(&ca->discards_in_flight);
177 closure_get(&ca->set->cl);
178
179 bio_init(&d->bio);
180
181 d->bio.bi_sector = bucket_to_sector(ca->set, d->bucket);
182 d->bio.bi_bdev = ca->bdev;
183 d->bio.bi_rw = REQ_WRITE|REQ_DISCARD;
184 d->bio.bi_max_vecs = 1;
185 d->bio.bi_io_vec = d->bio.bi_inline_vecs;
186 d->bio.bi_size = bucket_bytes(ca);
187 d->bio.bi_end_io = discard_endio;
188 bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
189
190 submit_bio(0, &d->bio);
191}
192
193/* Allocation */ 123/* Allocation */
194 124
195static inline bool can_inc_bucket_gen(struct bucket *b) 125static inline bool can_inc_bucket_gen(struct bucket *b)
@@ -280,7 +210,7 @@ static void invalidate_buckets_lru(struct cache *ca)
280 * multiple times when it can't do anything 210 * multiple times when it can't do anything
281 */ 211 */
282 ca->invalidate_needs_gc = 1; 212 ca->invalidate_needs_gc = 1;
283 bch_queue_gc(ca->set); 213 wake_up_gc(ca->set);
284 return; 214 return;
285 } 215 }
286 216
@@ -305,7 +235,7 @@ static void invalidate_buckets_fifo(struct cache *ca)
305 235
306 if (++checked >= ca->sb.nbuckets) { 236 if (++checked >= ca->sb.nbuckets) {
307 ca->invalidate_needs_gc = 1; 237 ca->invalidate_needs_gc = 1;
308 bch_queue_gc(ca->set); 238 wake_up_gc(ca->set);
309 return; 239 return;
310 } 240 }
311 } 241 }
@@ -330,7 +260,7 @@ static void invalidate_buckets_random(struct cache *ca)
330 260
331 if (++checked >= ca->sb.nbuckets / 2) { 261 if (++checked >= ca->sb.nbuckets / 2) {
332 ca->invalidate_needs_gc = 1; 262 ca->invalidate_needs_gc = 1;
333 bch_queue_gc(ca->set); 263 wake_up_gc(ca->set);
334 return; 264 return;
335 } 265 }
336 } 266 }
@@ -398,16 +328,18 @@ static int bch_allocator_thread(void *arg)
398 else 328 else
399 break; 329 break;
400 330
401 allocator_wait(ca, (int) fifo_free(&ca->free) >
402 atomic_read(&ca->discards_in_flight));
403
404 if (ca->discard) { 331 if (ca->discard) {
405 allocator_wait(ca, !list_empty(&ca->discards)); 332 mutex_unlock(&ca->set->bucket_lock);
406 do_discard(ca, bucket); 333 blkdev_issue_discard(ca->bdev,
407 } else { 334 bucket_to_sector(ca->set, bucket),
408 fifo_push(&ca->free, bucket); 335 ca->sb.block_size, GFP_KERNEL, 0);
409 closure_wake_up(&ca->set->bucket_wait); 336 mutex_lock(&ca->set->bucket_lock);
410 } 337 }
338
339 allocator_wait(ca, !fifo_full(&ca->free));
340
341 fifo_push(&ca->free, bucket);
342 wake_up(&ca->set->bucket_wait);
411 } 343 }
412 344
413 /* 345 /*
@@ -433,16 +365,40 @@ static int bch_allocator_thread(void *arg)
433 } 365 }
434} 366}
435 367
436long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl) 368long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait)
437{ 369{
438 long r = -1; 370 DEFINE_WAIT(w);
439again: 371 struct bucket *b;
372 long r;
373
374 /* fastpath */
375 if (fifo_used(&ca->free) > ca->watermark[watermark]) {
376 fifo_pop(&ca->free, r);
377 goto out;
378 }
379
380 if (!wait)
381 return -1;
382
383 while (1) {
384 if (fifo_used(&ca->free) > ca->watermark[watermark]) {
385 fifo_pop(&ca->free, r);
386 break;
387 }
388
389 prepare_to_wait(&ca->set->bucket_wait, &w,
390 TASK_UNINTERRUPTIBLE);
391
392 mutex_unlock(&ca->set->bucket_lock);
393 schedule();
394 mutex_lock(&ca->set->bucket_lock);
395 }
396
397 finish_wait(&ca->set->bucket_wait, &w);
398out:
440 wake_up_process(ca->alloc_thread); 399 wake_up_process(ca->alloc_thread);
441 400
442 if (fifo_used(&ca->free) > ca->watermark[watermark] && 401 if (expensive_debug_checks(ca->set)) {
443 fifo_pop(&ca->free, r)) {
444 struct bucket *b = ca->buckets + r;
445#ifdef CONFIG_BCACHE_EDEBUG
446 size_t iter; 402 size_t iter;
447 long i; 403 long i;
448 404
@@ -455,36 +411,23 @@ again:
455 BUG_ON(i == r); 411 BUG_ON(i == r);
456 fifo_for_each(i, &ca->unused, iter) 412 fifo_for_each(i, &ca->unused, iter)
457 BUG_ON(i == r); 413 BUG_ON(i == r);
458#endif
459 BUG_ON(atomic_read(&b->pin) != 1);
460
461 SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
462
463 if (watermark <= WATERMARK_METADATA) {
464 SET_GC_MARK(b, GC_MARK_METADATA);
465 b->prio = BTREE_PRIO;
466 } else {
467 SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
468 b->prio = INITIAL_PRIO;
469 }
470
471 return r;
472 } 414 }
473 415
474 trace_bcache_alloc_fail(ca); 416 b = ca->buckets + r;
475 417
476 if (cl) { 418 BUG_ON(atomic_read(&b->pin) != 1);
477 closure_wait(&ca->set->bucket_wait, cl);
478 419
479 if (closure_blocking(cl)) { 420 SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
480 mutex_unlock(&ca->set->bucket_lock); 421
481 closure_sync(cl); 422 if (watermark <= WATERMARK_METADATA) {
482 mutex_lock(&ca->set->bucket_lock); 423 SET_GC_MARK(b, GC_MARK_METADATA);
483 goto again; 424 b->prio = BTREE_PRIO;
484 } 425 } else {
426 SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
427 b->prio = INITIAL_PRIO;
485 } 428 }
486 429
487 return -1; 430 return r;
488} 431}
489 432
490void bch_bucket_free(struct cache_set *c, struct bkey *k) 433void bch_bucket_free(struct cache_set *c, struct bkey *k)
@@ -501,7 +444,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k)
501} 444}
502 445
503int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, 446int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
504 struct bkey *k, int n, struct closure *cl) 447 struct bkey *k, int n, bool wait)
505{ 448{
506 int i; 449 int i;
507 450
@@ -514,7 +457,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
514 457
515 for (i = 0; i < n; i++) { 458 for (i = 0; i < n; i++) {
516 struct cache *ca = c->cache_by_alloc[i]; 459 struct cache *ca = c->cache_by_alloc[i];
517 long b = bch_bucket_alloc(ca, watermark, cl); 460 long b = bch_bucket_alloc(ca, watermark, wait);
518 461
519 if (b == -1) 462 if (b == -1)
520 goto err; 463 goto err;
@@ -529,22 +472,202 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
529 return 0; 472 return 0;
530err: 473err:
531 bch_bucket_free(c, k); 474 bch_bucket_free(c, k);
532 __bkey_put(c, k); 475 bkey_put(c, k);
533 return -1; 476 return -1;
534} 477}
535 478
536int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, 479int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
537 struct bkey *k, int n, struct closure *cl) 480 struct bkey *k, int n, bool wait)
538{ 481{
539 int ret; 482 int ret;
540 mutex_lock(&c->bucket_lock); 483 mutex_lock(&c->bucket_lock);
541 ret = __bch_bucket_alloc_set(c, watermark, k, n, cl); 484 ret = __bch_bucket_alloc_set(c, watermark, k, n, wait);
542 mutex_unlock(&c->bucket_lock); 485 mutex_unlock(&c->bucket_lock);
543 return ret; 486 return ret;
544} 487}
545 488
489/* Sector allocator */
490
491struct open_bucket {
492 struct list_head list;
493 unsigned last_write_point;
494 unsigned sectors_free;
495 BKEY_PADDED(key);
496};
497
498/*
499 * We keep multiple buckets open for writes, and try to segregate different
500 * write streams for better cache utilization: first we look for a bucket where
501 * the last write to it was sequential with the current write, and failing that
502 * we look for a bucket that was last used by the same task.
503 *
 504 * The idea is that if you've got multiple tasks pulling data into the cache at the
505 * same time, you'll get better cache utilization if you try to segregate their
506 * data and preserve locality.
507 *
 508 * For example, say you're starting Firefox at the same time you're copying a
509 * bunch of files. Firefox will likely end up being fairly hot and stay in the
510 * cache awhile, but the data you copied might not be; if you wrote all that
511 * data to the same buckets it'd get invalidated at the same time.
512 *
513 * Both of those tasks will be doing fairly random IO so we can't rely on
514 * detecting sequential IO to segregate their data, but going off of the task
515 * should be a sane heuristic.
516 */
517static struct open_bucket *pick_data_bucket(struct cache_set *c,
518 const struct bkey *search,
519 unsigned write_point,
520 struct bkey *alloc)
521{
522 struct open_bucket *ret, *ret_task = NULL;
523
524 list_for_each_entry_reverse(ret, &c->data_buckets, list)
525 if (!bkey_cmp(&ret->key, search))
526 goto found;
527 else if (ret->last_write_point == write_point)
528 ret_task = ret;
529
530 ret = ret_task ?: list_first_entry(&c->data_buckets,
531 struct open_bucket, list);
532found:
533 if (!ret->sectors_free && KEY_PTRS(alloc)) {
534 ret->sectors_free = c->sb.bucket_size;
535 bkey_copy(&ret->key, alloc);
536 bkey_init(alloc);
537 }
538
539 if (!ret->sectors_free)
540 ret = NULL;
541
542 return ret;
543}
544
545/*
 546 * Allocates some space in the cache to write to, sets k to point to the newly
 547 * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) so that the key
 548 * points to the end of the newly allocated space.
549 *
550 * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many
551 * sectors were actually allocated.
552 *
553 * If s->writeback is true, will not fail.
554 */
555bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors,
556 unsigned write_point, unsigned write_prio, bool wait)
557{
558 struct open_bucket *b;
559 BKEY_PADDED(key) alloc;
560 unsigned i;
561
562 /*
563 * We might have to allocate a new bucket, which we can't do with a
564 * spinlock held. So if we have to allocate, we drop the lock, allocate
565 * and then retry. KEY_PTRS() indicates whether alloc points to
566 * allocated bucket(s).
567 */
568
569 bkey_init(&alloc.key);
570 spin_lock(&c->data_bucket_lock);
571
572 while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) {
573 unsigned watermark = write_prio
574 ? WATERMARK_MOVINGGC
575 : WATERMARK_NONE;
576
577 spin_unlock(&c->data_bucket_lock);
578
579 if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait))
580 return false;
581
582 spin_lock(&c->data_bucket_lock);
583 }
584
585 /*
586 * If we had to allocate, we might race and not need to allocate the
 587 * second time we call pick_data_bucket(). If we allocated a bucket but
588 * didn't use it, drop the refcount bch_bucket_alloc_set() took:
589 */
590 if (KEY_PTRS(&alloc.key))
591 bkey_put(c, &alloc.key);
592
593 for (i = 0; i < KEY_PTRS(&b->key); i++)
594 EBUG_ON(ptr_stale(c, &b->key, i));
595
596 /* Set up the pointer to the space we're allocating: */
597
598 for (i = 0; i < KEY_PTRS(&b->key); i++)
599 k->ptr[i] = b->key.ptr[i];
600
601 sectors = min(sectors, b->sectors_free);
602
603 SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors);
604 SET_KEY_SIZE(k, sectors);
605 SET_KEY_PTRS(k, KEY_PTRS(&b->key));
606
607 /*
608 * Move b to the end of the lru, and keep track of what this bucket was
609 * last used for:
610 */
611 list_move_tail(&b->list, &c->data_buckets);
612 bkey_copy_key(&b->key, k);
613 b->last_write_point = write_point;
614
615 b->sectors_free -= sectors;
616
617 for (i = 0; i < KEY_PTRS(&b->key); i++) {
618 SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors);
619
620 atomic_long_add(sectors,
621 &PTR_CACHE(c, &b->key, i)->sectors_written);
622 }
623
624 if (b->sectors_free < c->sb.block_size)
625 b->sectors_free = 0;
626
627 /*
628 * k takes refcounts on the buckets it points to until it's inserted
629 * into the btree, but if we're done with this bucket we just transfer
630 * get_data_bucket()'s refcount.
631 */
632 if (b->sectors_free)
633 for (i = 0; i < KEY_PTRS(&b->key); i++)
634 atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);
635
636 spin_unlock(&c->data_bucket_lock);
637 return true;
638}
639
546/* Init */ 640/* Init */
547 641
642void bch_open_buckets_free(struct cache_set *c)
643{
644 struct open_bucket *b;
645
646 while (!list_empty(&c->data_buckets)) {
647 b = list_first_entry(&c->data_buckets,
648 struct open_bucket, list);
649 list_del(&b->list);
650 kfree(b);
651 }
652}
653
654int bch_open_buckets_alloc(struct cache_set *c)
655{
656 int i;
657
658 spin_lock_init(&c->data_bucket_lock);
659
660 for (i = 0; i < 6; i++) {
661 struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
662 if (!b)
663 return -ENOMEM;
664
665 list_add(&b->list, &c->data_buckets);
666 }
667
668 return 0;
669}
670
548int bch_cache_allocator_start(struct cache *ca) 671int bch_cache_allocator_start(struct cache *ca)
549{ 672{
550 struct task_struct *k = kthread_run(bch_allocator_thread, 673 struct task_struct *k = kthread_run(bch_allocator_thread,
@@ -556,22 +679,8 @@ int bch_cache_allocator_start(struct cache *ca)
556 return 0; 679 return 0;
557} 680}
558 681
559void bch_cache_allocator_exit(struct cache *ca)
560{
561 struct discard *d;
562
563 while (!list_empty(&ca->discards)) {
564 d = list_first_entry(&ca->discards, struct discard, list);
565 cancel_work_sync(&d->work);
566 list_del(&d->list);
567 kfree(d);
568 }
569}
570
571int bch_cache_allocator_init(struct cache *ca) 682int bch_cache_allocator_init(struct cache *ca)
572{ 683{
573 unsigned i;
574
575 /* 684 /*
576 * Reserve: 685 * Reserve:
577 * Prio/gen writes first 686 * Prio/gen writes first
@@ -589,15 +698,5 @@ int bch_cache_allocator_init(struct cache *ca)
589 ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + 698 ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
590 ca->watermark[WATERMARK_MOVINGGC]; 699 ca->watermark[WATERMARK_MOVINGGC];
591 700
592 for (i = 0; i < MAX_IN_FLIGHT_DISCARDS; i++) {
593 struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL);
594 if (!d)
595 return -ENOMEM;
596
597 d->ca = ca;
598 INIT_WORK(&d->work, discard_finish);
599 list_add(&d->list, &ca->discards);
600 }
601
602 return 0; 701 return 0;
603} 702}
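
bch_bucket_alloc() above replaces the old closure-based waiting with a plain wait queue: check the condition under bucket_lock, queue ourselves with prepare_to_wait(), drop the lock so the allocator thread can refill the fifo, schedule(), retake the lock, and retry. A generic sketch of that pattern follows; the condition callback and lock here are placeholders, not bcache symbols.

#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/wait.h>

/*
 * Sleep until cond(arg) holds. 'lock' protects the condition and is
 * dropped across the sleep so the producer can change it; because the
 * task is already on the wait queue before unlocking, the producer's
 * wake_up() cannot be lost.
 */
static void wait_for_condition(wait_queue_head_t *wq, struct mutex *lock,
			       bool (*cond)(void *), void *arg)
{
	DEFINE_WAIT(w);

	while (!cond(arg)) {
		prepare_to_wait(wq, &w, TASK_UNINTERRUPTIBLE);

		mutex_unlock(lock);
		schedule();
		mutex_lock(lock);
	}

	finish_wait(wq, &w);
}
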
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 0f12382aa35d..4beb55a0ff30 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -177,6 +177,7 @@
177 177
178#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ 178#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
179 179
180#include <linux/bcache.h>
180#include <linux/bio.h> 181#include <linux/bio.h>
181#include <linux/kobject.h> 182#include <linux/kobject.h>
182#include <linux/list.h> 183#include <linux/list.h>
@@ -210,168 +211,6 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2);
210#define GC_MARK_METADATA 2 211#define GC_MARK_METADATA 2
211BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14); 212BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14);
212 213
213struct bkey {
214 uint64_t high;
215 uint64_t low;
216 uint64_t ptr[];
217};
218
219/* Enough for a key with 6 pointers */
220#define BKEY_PAD 8
221
222#define BKEY_PADDED(key) \
223 union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; }
224
225/* Version 0: Cache device
226 * Version 1: Backing device
227 * Version 2: Seed pointer into btree node checksum
228 * Version 3: Cache device with new UUID format
229 * Version 4: Backing device with data offset
230 */
231#define BCACHE_SB_VERSION_CDEV 0
232#define BCACHE_SB_VERSION_BDEV 1
233#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3
234#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4
235#define BCACHE_SB_MAX_VERSION 4
236
237#define SB_SECTOR 8
238#define SB_SIZE 4096
239#define SB_LABEL_SIZE 32
240#define SB_JOURNAL_BUCKETS 256U
241/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */
242#define MAX_CACHES_PER_SET 8
243
244#define BDEV_DATA_START_DEFAULT 16 /* sectors */
245
246struct cache_sb {
247 uint64_t csum;
248 uint64_t offset; /* sector where this sb was written */
249 uint64_t version;
250
251 uint8_t magic[16];
252
253 uint8_t uuid[16];
254 union {
255 uint8_t set_uuid[16];
256 uint64_t set_magic;
257 };
258 uint8_t label[SB_LABEL_SIZE];
259
260 uint64_t flags;
261 uint64_t seq;
262 uint64_t pad[8];
263
264 union {
265 struct {
266 /* Cache devices */
267 uint64_t nbuckets; /* device size */
268
269 uint16_t block_size; /* sectors */
270 uint16_t bucket_size; /* sectors */
271
272 uint16_t nr_in_set;
273 uint16_t nr_this_dev;
274 };
275 struct {
276 /* Backing devices */
277 uint64_t data_offset;
278
279 /*
280 * block_size from the cache device section is still used by
281 * backing devices, so don't add anything here until we fix
282 * things to not need it for backing devices anymore
283 */
284 };
285 };
286
287 uint32_t last_mount; /* time_t */
288
289 uint16_t first_bucket;
290 union {
291 uint16_t njournal_buckets;
292 uint16_t keys;
293 };
294 uint64_t d[SB_JOURNAL_BUCKETS]; /* journal buckets */
295};
296
297BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1);
298BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1);
299BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3);
300#define CACHE_REPLACEMENT_LRU 0U
301#define CACHE_REPLACEMENT_FIFO 1U
302#define CACHE_REPLACEMENT_RANDOM 2U
303
304BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4);
305#define CACHE_MODE_WRITETHROUGH 0U
306#define CACHE_MODE_WRITEBACK 1U
307#define CACHE_MODE_WRITEAROUND 2U
308#define CACHE_MODE_NONE 3U
309BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2);
310#define BDEV_STATE_NONE 0U
311#define BDEV_STATE_CLEAN 1U
312#define BDEV_STATE_DIRTY 2U
313#define BDEV_STATE_STALE 3U
314
315/* Version 1: Seed pointer into btree node checksum
316 */
317#define BCACHE_BSET_VERSION 1
318
319/*
320 * This is the on disk format for btree nodes - a btree node on disk is a list
321 * of these; within each set the keys are sorted
322 */
323struct bset {
324 uint64_t csum;
325 uint64_t magic;
326 uint64_t seq;
327 uint32_t version;
328 uint32_t keys;
329
330 union {
331 struct bkey start[0];
332 uint64_t d[0];
333 };
334};
335
336/*
337 * On disk format for priorities and gens - see super.c near prio_write() for
338 * more.
339 */
340struct prio_set {
341 uint64_t csum;
342 uint64_t magic;
343 uint64_t seq;
344 uint32_t version;
345 uint32_t pad;
346
347 uint64_t next_bucket;
348
349 struct bucket_disk {
350 uint16_t prio;
351 uint8_t gen;
352 } __attribute((packed)) data[];
353};
354
355struct uuid_entry {
356 union {
357 struct {
358 uint8_t uuid[16];
359 uint8_t label[32];
360 uint32_t first_reg;
361 uint32_t last_reg;
362 uint32_t invalidated;
363
364 uint32_t flags;
365 /* Size of flash only volumes */
366 uint64_t sectors;
367 };
368
369 uint8_t pad[128];
370 };
371};
372
373BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1);
374
375#include "journal.h" 214#include "journal.h"
376#include "stats.h" 215#include "stats.h"
377struct search; 216struct search;
@@ -384,8 +223,6 @@ struct keybuf_key {
384 void *private; 223 void *private;
385}; 224};
386 225
387typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);
388
389struct keybuf { 226struct keybuf {
390 struct bkey last_scanned; 227 struct bkey last_scanned;
391 spinlock_t lock; 228 spinlock_t lock;
@@ -400,7 +237,7 @@ struct keybuf {
400 237
401 struct rb_root keys; 238 struct rb_root keys;
402 239
403#define KEYBUF_NR 100 240#define KEYBUF_NR 500
404 DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR); 241 DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR);
405}; 242};
406 243
@@ -429,16 +266,15 @@ struct bcache_device {
429 266
430 struct gendisk *disk; 267 struct gendisk *disk;
431 268
432 /* If nonzero, we're closing */ 269 unsigned long flags;
433 atomic_t closing; 270#define BCACHE_DEV_CLOSING 0
434 271#define BCACHE_DEV_DETACHING 1
435 /* If nonzero, we're detaching/unregistering from cache set */ 272#define BCACHE_DEV_UNLINK_DONE 2
436 atomic_t detaching;
437 int flush_done;
438 273
439 uint64_t nr_stripes; 274 unsigned nr_stripes;
440 unsigned stripe_size_bits; 275 unsigned stripe_size;
441 atomic_t *stripe_sectors_dirty; 276 atomic_t *stripe_sectors_dirty;
277 unsigned long *full_dirty_stripes;
442 278
443 unsigned long sectors_dirty_last; 279 unsigned long sectors_dirty_last;
444 long sectors_dirty_derivative; 280 long sectors_dirty_derivative;
@@ -509,7 +345,7 @@ struct cached_dev {
509 345
510 /* Limit number of writeback bios in flight */ 346 /* Limit number of writeback bios in flight */
511 struct semaphore in_flight; 347 struct semaphore in_flight;
512 struct closure_with_timer writeback; 348 struct task_struct *writeback_thread;
513 349
514 struct keybuf writeback_keys; 350 struct keybuf writeback_keys;
515 351
@@ -527,8 +363,8 @@ struct cached_dev {
527 unsigned sequential_cutoff; 363 unsigned sequential_cutoff;
528 unsigned readahead; 364 unsigned readahead;
529 365
530 unsigned sequential_merge:1;
531 unsigned verify:1; 366 unsigned verify:1;
367 unsigned bypass_torture_test:1;
532 368
533 unsigned partial_stripes_expensive:1; 369 unsigned partial_stripes_expensive:1;
534 unsigned writeback_metadata:1; 370 unsigned writeback_metadata:1;
@@ -620,15 +456,6 @@ struct cache {
620 456
621 bool discard; /* Get rid of? */ 457 bool discard; /* Get rid of? */
622 458
623 /*
624 * We preallocate structs for issuing discards to buckets, and keep them
625 * on this list when they're not in use; do_discard() issues discards
626 * whenever there's work to do and is called by free_some_buckets() and
627 * when a discard finishes.
628 */
629 atomic_t discards_in_flight;
630 struct list_head discards;
631
632 struct journal_device journal; 459 struct journal_device journal;
633 460
634 /* The rest of this all shows up in sysfs */ 461 /* The rest of this all shows up in sysfs */
@@ -649,7 +476,6 @@ struct gc_stat {
649 476
650 size_t nkeys; 477 size_t nkeys;
651 uint64_t data; /* sectors */ 478 uint64_t data; /* sectors */
652 uint64_t dirty; /* sectors */
653 unsigned in_use; /* percent */ 479 unsigned in_use; /* percent */
654}; 480};
655 481
@@ -744,8 +570,8 @@ struct cache_set {
744 * basically a lock for this that we can wait on asynchronously. The 570 * basically a lock for this that we can wait on asynchronously. The
745 * btree_root() macro releases the lock when it returns. 571 * btree_root() macro releases the lock when it returns.
746 */ 572 */
747 struct closure *try_harder; 573 struct task_struct *try_harder;
748 struct closure_waitlist try_wait; 574 wait_queue_head_t try_wait;
749 uint64_t try_harder_start; 575 uint64_t try_harder_start;
750 576
751 /* 577 /*
@@ -759,7 +585,7 @@ struct cache_set {
759 * written. 585 * written.
760 */ 586 */
761 atomic_t prio_blocked; 587 atomic_t prio_blocked;
762 struct closure_waitlist bucket_wait; 588 wait_queue_head_t bucket_wait;
763 589
764 /* 590 /*
765 * For any bio we don't skip we subtract the number of sectors from 591 * For any bio we don't skip we subtract the number of sectors from
@@ -782,7 +608,7 @@ struct cache_set {
782 struct gc_stat gc_stats; 608 struct gc_stat gc_stats;
783 size_t nbuckets; 609 size_t nbuckets;
784 610
785 struct closure_with_waitlist gc; 611 struct task_struct *gc_thread;
786 /* Where in the btree gc currently is */ 612 /* Where in the btree gc currently is */
787 struct bkey gc_done; 613 struct bkey gc_done;
788 614
@@ -795,11 +621,10 @@ struct cache_set {
795 /* Counts how many sectors bio_insert has added to the cache */ 621 /* Counts how many sectors bio_insert has added to the cache */
796 atomic_t sectors_to_gc; 622 atomic_t sectors_to_gc;
797 623
798 struct closure moving_gc; 624 wait_queue_head_t moving_gc_wait;
799 struct closure_waitlist moving_gc_wait;
800 struct keybuf moving_gc_keys; 625 struct keybuf moving_gc_keys;
801 /* Number of moving GC bios in flight */ 626 /* Number of moving GC bios in flight */
802 atomic_t in_flight; 627 struct semaphore moving_in_flight;
803 628
804 struct btree *root; 629 struct btree *root;
805 630
@@ -841,22 +666,27 @@ struct cache_set {
841 unsigned congested_read_threshold_us; 666 unsigned congested_read_threshold_us;
842 unsigned congested_write_threshold_us; 667 unsigned congested_write_threshold_us;
843 668
844 spinlock_t sort_time_lock;
845 struct time_stats sort_time; 669 struct time_stats sort_time;
846 struct time_stats btree_gc_time; 670 struct time_stats btree_gc_time;
847 struct time_stats btree_split_time; 671 struct time_stats btree_split_time;
848 spinlock_t btree_read_time_lock;
849 struct time_stats btree_read_time; 672 struct time_stats btree_read_time;
850 struct time_stats try_harder_time; 673 struct time_stats try_harder_time;
851 674
852 atomic_long_t cache_read_races; 675 atomic_long_t cache_read_races;
853 atomic_long_t writeback_keys_done; 676 atomic_long_t writeback_keys_done;
854 atomic_long_t writeback_keys_failed; 677 atomic_long_t writeback_keys_failed;
678
679 enum {
680 ON_ERROR_UNREGISTER,
681 ON_ERROR_PANIC,
682 } on_error;
855 unsigned error_limit; 683 unsigned error_limit;
856 unsigned error_decay; 684 unsigned error_decay;
685
857 unsigned short journal_delay_ms; 686 unsigned short journal_delay_ms;
858 unsigned verify:1; 687 unsigned verify:1;
859 unsigned key_merging_disabled:1; 688 unsigned key_merging_disabled:1;
689 unsigned expensive_debug_checks:1;
860 unsigned gc_always_rewrite:1; 690 unsigned gc_always_rewrite:1;
861 unsigned shrinker_disabled:1; 691 unsigned shrinker_disabled:1;
862 unsigned copy_gc_enabled:1; 692 unsigned copy_gc_enabled:1;
@@ -865,21 +695,6 @@ struct cache_set {
865 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; 695 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS];
866}; 696};
867 697
868static inline bool key_merging_disabled(struct cache_set *c)
869{
870#ifdef CONFIG_BCACHE_DEBUG
871 return c->key_merging_disabled;
872#else
873 return 0;
874#endif
875}
876
877static inline bool SB_IS_BDEV(const struct cache_sb *sb)
878{
879 return sb->version == BCACHE_SB_VERSION_BDEV
880 || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
881}
882
883struct bbio { 698struct bbio {
884 unsigned submit_time_us; 699 unsigned submit_time_us;
885 union { 700 union {
@@ -933,59 +748,6 @@ static inline unsigned local_clock_us(void)
933#define prio_buckets(c) \ 748#define prio_buckets(c) \
934 DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c)) 749 DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c))
935 750
936#define JSET_MAGIC 0x245235c1a3625032ULL
937#define PSET_MAGIC 0x6750e15f87337f91ULL
938#define BSET_MAGIC 0x90135c78b99e07f5ULL
939
940#define jset_magic(c) ((c)->sb.set_magic ^ JSET_MAGIC)
941#define pset_magic(c) ((c)->sb.set_magic ^ PSET_MAGIC)
942#define bset_magic(c) ((c)->sb.set_magic ^ BSET_MAGIC)
943
944/* Bkey fields: all units are in sectors */
945
946#define KEY_FIELD(name, field, offset, size) \
947 BITMASK(name, struct bkey, field, offset, size)
948
949#define PTR_FIELD(name, offset, size) \
950 static inline uint64_t name(const struct bkey *k, unsigned i) \
951 { return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); } \
952 \
953 static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\
954 { \
955 k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset); \
956 k->ptr[i] |= v << offset; \
957 }
958
959KEY_FIELD(KEY_PTRS, high, 60, 3)
960KEY_FIELD(HEADER_SIZE, high, 58, 2)
961KEY_FIELD(KEY_CSUM, high, 56, 2)
962KEY_FIELD(KEY_PINNED, high, 55, 1)
963KEY_FIELD(KEY_DIRTY, high, 36, 1)
964
965KEY_FIELD(KEY_SIZE, high, 20, 16)
966KEY_FIELD(KEY_INODE, high, 0, 20)
967
968/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */
969
970static inline uint64_t KEY_OFFSET(const struct bkey *k)
971{
972 return k->low;
973}
974
975static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v)
976{
977 k->low = v;
978}
979
980PTR_FIELD(PTR_DEV, 51, 12)
981PTR_FIELD(PTR_OFFSET, 8, 43)
982PTR_FIELD(PTR_GEN, 0, 8)
983
984#define PTR_CHECK_DEV ((1 << 12) - 1)
985
986#define PTR(gen, offset, dev) \
987 ((((uint64_t) dev) << 51) | ((uint64_t) offset) << 8 | gen)
988
989static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) 751static inline size_t sector_to_bucket(struct cache_set *c, sector_t s)
990{ 752{
991 return s >> c->bucket_bits; 753 return s >> c->bucket_bits;
@@ -1024,27 +786,11 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c,
1024 786
1025/* Btree key macros */ 787/* Btree key macros */
1026 788
1027/*
1028 * The high bit being set is a relic from when we used it to do binary
1029 * searches - it told you where a key started. It's not used anymore,
1030 * and can probably be safely dropped.
1031 */
1032#define KEY(dev, sector, len) \
1033((struct bkey) { \
1034 .high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev), \
1035 .low = (sector) \
1036})
1037
1038static inline void bkey_init(struct bkey *k) 789static inline void bkey_init(struct bkey *k)
1039{ 790{
1040 *k = KEY(0, 0, 0); 791 *k = ZERO_KEY;
1041} 792}
1042 793
1043#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k))
1044#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0)
1045#define MAX_KEY KEY(~(~0 << 20), ((uint64_t) ~0) >> 1, 0)
1046#define ZERO_KEY KEY(0, 0, 0)
1047
1048/* 794/*
1049 * This is used for various on disk data structures - cache_sb, prio_set, bset, 795 * This is used for various on disk data structures - cache_sb, prio_set, bset,
1050 * jset: The checksum is _always_ the first 8 bytes of these structs 796 * jset: The checksum is _always_ the first 8 bytes of these structs
@@ -1094,14 +840,6 @@ do { \
1094 for (b = (ca)->buckets + (ca)->sb.first_bucket; \ 840 for (b = (ca)->buckets + (ca)->sb.first_bucket; \
1095 b < (ca)->buckets + (ca)->sb.nbuckets; b++) 841 b < (ca)->buckets + (ca)->sb.nbuckets; b++)
1096 842
1097static inline void __bkey_put(struct cache_set *c, struct bkey *k)
1098{
1099 unsigned i;
1100
1101 for (i = 0; i < KEY_PTRS(k); i++)
1102 atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
1103}
1104
1105static inline void cached_dev_put(struct cached_dev *dc) 843static inline void cached_dev_put(struct cached_dev *dc)
1106{ 844{
1107 if (atomic_dec_and_test(&dc->count)) 845 if (atomic_dec_and_test(&dc->count))
@@ -1173,13 +911,15 @@ uint8_t bch_inc_gen(struct cache *, struct bucket *);
1173void bch_rescale_priorities(struct cache_set *, int); 911void bch_rescale_priorities(struct cache_set *, int);
1174bool bch_bucket_add_unused(struct cache *, struct bucket *); 912bool bch_bucket_add_unused(struct cache *, struct bucket *);
1175 913
1176long bch_bucket_alloc(struct cache *, unsigned, struct closure *); 914long bch_bucket_alloc(struct cache *, unsigned, bool);
1177void bch_bucket_free(struct cache_set *, struct bkey *); 915void bch_bucket_free(struct cache_set *, struct bkey *);
1178 916
1179int __bch_bucket_alloc_set(struct cache_set *, unsigned, 917int __bch_bucket_alloc_set(struct cache_set *, unsigned,
1180 struct bkey *, int, struct closure *); 918 struct bkey *, int, bool);
1181int bch_bucket_alloc_set(struct cache_set *, unsigned, 919int bch_bucket_alloc_set(struct cache_set *, unsigned,
1182 struct bkey *, int, struct closure *); 920 struct bkey *, int, bool);
921bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned,
922 unsigned, unsigned, bool);
1183 923
1184__printf(2, 3) 924__printf(2, 3)
1185bool bch_cache_set_error(struct cache_set *, const char *, ...); 925bool bch_cache_set_error(struct cache_set *, const char *, ...);
@@ -1187,7 +927,7 @@ bool bch_cache_set_error(struct cache_set *, const char *, ...);
1187void bch_prio_write(struct cache *); 927void bch_prio_write(struct cache *);
1188void bch_write_bdev_super(struct cached_dev *, struct closure *); 928void bch_write_bdev_super(struct cached_dev *, struct closure *);
1189 929
1190extern struct workqueue_struct *bcache_wq, *bch_gc_wq; 930extern struct workqueue_struct *bcache_wq;
1191extern const char * const bch_cache_modes[]; 931extern const char * const bch_cache_modes[];
1192extern struct mutex bch_register_lock; 932extern struct mutex bch_register_lock;
1193extern struct list_head bch_cache_sets; 933extern struct list_head bch_cache_sets;
@@ -1220,15 +960,14 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *);
1220void bch_btree_cache_free(struct cache_set *); 960void bch_btree_cache_free(struct cache_set *);
1221int bch_btree_cache_alloc(struct cache_set *); 961int bch_btree_cache_alloc(struct cache_set *);
1222void bch_moving_init_cache_set(struct cache_set *); 962void bch_moving_init_cache_set(struct cache_set *);
963int bch_open_buckets_alloc(struct cache_set *);
964void bch_open_buckets_free(struct cache_set *);
1223 965
1224int bch_cache_allocator_start(struct cache *ca); 966int bch_cache_allocator_start(struct cache *ca);
1225void bch_cache_allocator_exit(struct cache *ca);
1226int bch_cache_allocator_init(struct cache *ca); 967int bch_cache_allocator_init(struct cache *ca);
1227 968
1228void bch_debug_exit(void); 969void bch_debug_exit(void);
1229int bch_debug_init(struct kobject *); 970int bch_debug_init(struct kobject *);
1230void bch_writeback_exit(void);
1231int bch_writeback_init(void);
1232void bch_request_exit(void); 971void bch_request_exit(void);
1233int bch_request_init(void); 972int bch_request_init(void);
1234void bch_btree_exit(void); 973void bch_btree_exit(void);
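
The large block removed from bcache.h above (struct bkey, the KEY()/PTR() helpers, cache_sb, bset, prio_set, uuid_entry) is not deleted; it moves into the new <linux/bcache.h> uapi header included at the top of this file, and the helpers keep the shape shown in the removed lines. A tiny usage sketch based on those macros, with invented values:

/* Illustrative only: build a one-pointer extent key on the stack. */
static void example_fill_extent_key(struct bkey *dst)
{
	BKEY_PADDED(key) tmp;	/* padded so ptr[] has room on the stack */

	/* KEY(inode, end offset in sectors, size in sectors) */
	tmp.key = KEY(1, 1024, 8);
	SET_KEY_PTRS(&tmp.key, 1);
	/* PTR(gen, sector offset within the cache device, device index) */
	tmp.key.ptr[0] = PTR(0, 2048, 0);

	/* KEY_START() == 1016, i.e. the key describes the extent [1016, 1024). */
	bkey_copy(dst, &tmp.key);
}
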
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 22d1ae72c282..7d388b8bb50e 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -14,22 +14,12 @@
14 14
15/* Keylists */ 15/* Keylists */
16 16
17void bch_keylist_copy(struct keylist *dest, struct keylist *src)
18{
19 *dest = *src;
20
21 if (src->list == src->d) {
22 size_t n = (uint64_t *) src->top - src->d;
23 dest->top = (struct bkey *) &dest->d[n];
24 dest->list = dest->d;
25 }
26}
27
28int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) 17int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c)
29{ 18{
30 unsigned oldsize = (uint64_t *) l->top - l->list; 19 size_t oldsize = bch_keylist_nkeys(l);
31 unsigned newsize = oldsize + 2 + nptrs; 20 size_t newsize = oldsize + 2 + nptrs;
32 uint64_t *new; 21 uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p;
22 uint64_t *new_keys;
33 23
34 /* The journalling code doesn't handle the case where the keys to insert 24 /* The journalling code doesn't handle the case where the keys to insert
35 * is bigger than an empty write: If we just return -ENOMEM here, 25 * is bigger than an empty write: If we just return -ENOMEM here,
@@ -45,24 +35,23 @@ int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c)
45 roundup_pow_of_two(oldsize) == newsize) 35 roundup_pow_of_two(oldsize) == newsize)
46 return 0; 36 return 0;
47 37
48 new = krealloc(l->list == l->d ? NULL : l->list, 38 new_keys = krealloc(old_keys, sizeof(uint64_t) * newsize, GFP_NOIO);
49 sizeof(uint64_t) * newsize, GFP_NOIO);
50 39
51 if (!new) 40 if (!new_keys)
52 return -ENOMEM; 41 return -ENOMEM;
53 42
54 if (l->list == l->d) 43 if (!old_keys)
55 memcpy(new, l->list, sizeof(uint64_t) * KEYLIST_INLINE); 44 memcpy(new_keys, l->inline_keys, sizeof(uint64_t) * oldsize);
56 45
57 l->list = new; 46 l->keys_p = new_keys;
58 l->top = (struct bkey *) (&l->list[oldsize]); 47 l->top_p = new_keys + oldsize;
59 48
60 return 0; 49 return 0;
61} 50}
62 51
63struct bkey *bch_keylist_pop(struct keylist *l) 52struct bkey *bch_keylist_pop(struct keylist *l)
64{ 53{
65 struct bkey *k = l->bottom; 54 struct bkey *k = l->keys;
66 55
67 if (k == l->top) 56 if (k == l->top)
68 return NULL; 57 return NULL;
@@ -73,21 +62,20 @@ struct bkey *bch_keylist_pop(struct keylist *l)
73 return l->top = k; 62 return l->top = k;
74} 63}
75 64
76/* Pointer validation */ 65void bch_keylist_pop_front(struct keylist *l)
77
78bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
79{ 66{
80 unsigned i; 67 l->top_p -= bkey_u64s(l->keys);
81 char buf[80];
82 68
83 if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))) 69 memmove(l->keys,
84 goto bad; 70 bkey_next(l->keys),
71 bch_keylist_bytes(l));
72}
85 73
86 if (!level && KEY_SIZE(k) > KEY_OFFSET(k)) 74/* Pointer validation */
87 goto bad;
88 75
89 if (!KEY_SIZE(k)) 76static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
90 return true; 77{
78 unsigned i;
91 79
92 for (i = 0; i < KEY_PTRS(k); i++) 80 for (i = 0; i < KEY_PTRS(k); i++)
93 if (ptr_available(c, k, i)) { 81 if (ptr_available(c, k, i)) {
@@ -98,13 +86,83 @@ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
98 if (KEY_SIZE(k) + r > c->sb.bucket_size || 86 if (KEY_SIZE(k) + r > c->sb.bucket_size ||
99 bucket < ca->sb.first_bucket || 87 bucket < ca->sb.first_bucket ||
100 bucket >= ca->sb.nbuckets) 88 bucket >= ca->sb.nbuckets)
101 goto bad; 89 return true;
102 } 90 }
103 91
104 return false; 92 return false;
93}
94
95bool bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
96{
97 char buf[80];
98
99 if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
100 goto bad;
101
102 if (__ptr_invalid(c, k))
103 goto bad;
104
105 return false;
106bad:
107 bch_bkey_to_text(buf, sizeof(buf), k);
108 cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
109 return true;
110}
111
112bool bch_extent_ptr_invalid(struct cache_set *c, const struct bkey *k)
113{
114 char buf[80];
115
116 if (!KEY_SIZE(k))
117 return true;
118
119 if (KEY_SIZE(k) > KEY_OFFSET(k))
120 goto bad;
121
122 if (__ptr_invalid(c, k))
123 goto bad;
124
125 return false;
105bad: 126bad:
106 bch_bkey_to_text(buf, sizeof(buf), k); 127 bch_bkey_to_text(buf, sizeof(buf), k);
107 cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k)); 128 cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k));
129 return true;
130}
131
132static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k,
133 unsigned ptr)
134{
135 struct bucket *g = PTR_BUCKET(b->c, k, ptr);
136 char buf[80];
137
138 if (mutex_trylock(&b->c->bucket_lock)) {
139 if (b->level) {
140 if (KEY_DIRTY(k) ||
141 g->prio != BTREE_PRIO ||
142 (b->c->gc_mark_valid &&
143 GC_MARK(g) != GC_MARK_METADATA))
144 goto err;
145
146 } else {
147 if (g->prio == BTREE_PRIO)
148 goto err;
149
150 if (KEY_DIRTY(k) &&
151 b->c->gc_mark_valid &&
152 GC_MARK(g) != GC_MARK_DIRTY)
153 goto err;
154 }
155 mutex_unlock(&b->c->bucket_lock);
156 }
157
158 return false;
159err:
160 mutex_unlock(&b->c->bucket_lock);
161 bch_bkey_to_text(buf, sizeof(buf), k);
162 btree_bug(b,
163"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
164 buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
165 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
108 return true; 166 return true;
109} 167}
110 168
@@ -118,64 +176,29 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k)
118 bch_ptr_invalid(b, k)) 176 bch_ptr_invalid(b, k))
119 return true; 177 return true;
120 178
121 if (KEY_PTRS(k) && PTR_DEV(k, 0) == PTR_CHECK_DEV) 179 for (i = 0; i < KEY_PTRS(k); i++) {
122 return true; 180 if (!ptr_available(b->c, k, i))
181 return true;
123 182
124 for (i = 0; i < KEY_PTRS(k); i++) 183 g = PTR_BUCKET(b->c, k, i);
125 if (ptr_available(b->c, k, i)) { 184 stale = ptr_stale(b->c, k, i);
126 g = PTR_BUCKET(b->c, k, i);
127 stale = ptr_stale(b->c, k, i);
128 185
129 btree_bug_on(stale > 96, b, 186 btree_bug_on(stale > 96, b,
130 "key too stale: %i, need_gc %u", 187 "key too stale: %i, need_gc %u",
131 stale, b->c->need_gc); 188 stale, b->c->need_gc);
132 189
133 btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), 190 btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
134 b, "stale dirty pointer"); 191 b, "stale dirty pointer");
135 192
136 if (stale) 193 if (stale)
137 return true; 194 return true;
138 195
139#ifdef CONFIG_BCACHE_EDEBUG 196 if (expensive_debug_checks(b->c) &&
140 if (!mutex_trylock(&b->c->bucket_lock)) 197 ptr_bad_expensive_checks(b, k, i))
141 continue; 198 return true;
142 199 }
143 if (b->level) {
144 if (KEY_DIRTY(k) ||
145 g->prio != BTREE_PRIO ||
146 (b->c->gc_mark_valid &&
147 GC_MARK(g) != GC_MARK_METADATA))
148 goto bug;
149
150 } else {
151 if (g->prio == BTREE_PRIO)
152 goto bug;
153
154 if (KEY_DIRTY(k) &&
155 b->c->gc_mark_valid &&
156 GC_MARK(g) != GC_MARK_DIRTY)
157 goto bug;
158 }
159 mutex_unlock(&b->c->bucket_lock);
160#endif
161 }
162 200
163 return false; 201 return false;
164#ifdef CONFIG_BCACHE_EDEBUG
165bug:
166 mutex_unlock(&b->c->bucket_lock);
167
168 {
169 char buf[80];
170
171 bch_bkey_to_text(buf, sizeof(buf), k);
172 btree_bug(b,
173"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
174 buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
175 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
176 }
177 return true;
178#endif
179} 202}
180 203
181/* Key/pointer manipulation */ 204/* Key/pointer manipulation */
@@ -458,16 +481,8 @@ static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline)
458 481
459static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift) 482static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
460{ 483{
461#ifdef CONFIG_X86_64
462 asm("shrd %[shift],%[high],%[low]"
463 : [low] "+Rm" (low)
464 : [high] "R" (high),
465 [shift] "ci" (shift)
466 : "cc");
467#else
468 low >>= shift; 484 low >>= shift;
469 low |= (high << 1) << (63U - shift); 485 low |= (high << 1) << (63U - shift);
470#endif
471 return low; 486 return low;
472} 487}
473 488
@@ -686,7 +701,7 @@ void bch_bset_init_next(struct btree *b)
686 } else 701 } else
687 get_random_bytes(&i->seq, sizeof(uint64_t)); 702 get_random_bytes(&i->seq, sizeof(uint64_t));
688 703
689 i->magic = bset_magic(b->c); 704 i->magic = bset_magic(&b->c->sb);
690 i->version = 0; 705 i->version = 0;
691 i->keys = 0; 706 i->keys = 0;
692 707
@@ -824,16 +839,16 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
824 } else 839 } else
825 i = bset_search_write_set(b, t, search); 840 i = bset_search_write_set(b, t, search);
826 841
827#ifdef CONFIG_BCACHE_EDEBUG 842 if (expensive_debug_checks(b->c)) {
828 BUG_ON(bset_written(b, t) && 843 BUG_ON(bset_written(b, t) &&
829 i.l != t->data->start && 844 i.l != t->data->start &&
830 bkey_cmp(tree_to_prev_bkey(t, 845 bkey_cmp(tree_to_prev_bkey(t,
831 inorder_to_tree(bkey_to_cacheline(t, i.l), t)), 846 inorder_to_tree(bkey_to_cacheline(t, i.l), t)),
832 search) > 0); 847 search) > 0);
833 848
834 BUG_ON(i.r != end(t->data) && 849 BUG_ON(i.r != end(t->data) &&
835 bkey_cmp(i.r, search) <= 0); 850 bkey_cmp(i.r, search) <= 0);
836#endif 851 }
837 852
838 while (likely(i.l != i.r) && 853 while (likely(i.l != i.r) &&
839 bkey_cmp(i.l, search) <= 0) 854 bkey_cmp(i.l, search) <= 0)
@@ -844,6 +859,13 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
844 859
845/* Btree iterator */ 860/* Btree iterator */
846 861
862/*
863 * Returns true if l > r - unless l == r, in which case returns true if l is
864 * older than r.
865 *
866 * Necessary for btree_sort_fixup() - if there are multiple keys that compare
867 * equal in different sets, we have to process them newest to oldest.
868 */
847static inline bool btree_iter_cmp(struct btree_iter_set l, 869static inline bool btree_iter_cmp(struct btree_iter_set l,
848 struct btree_iter_set r) 870 struct btree_iter_set r)
849{ 871{
@@ -867,12 +889,16 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
867} 889}
868 890
869struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, 891struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter,
870 struct bkey *search, struct bset_tree *start) 892 struct bkey *search, struct bset_tree *start)
871{ 893{
872 struct bkey *ret = NULL; 894 struct bkey *ret = NULL;
873 iter->size = ARRAY_SIZE(iter->data); 895 iter->size = ARRAY_SIZE(iter->data);
874 iter->used = 0; 896 iter->used = 0;
875 897
898#ifdef CONFIG_BCACHE_DEBUG
899 iter->b = b;
900#endif
901
876 for (; start <= &b->sets[b->nsets]; start++) { 902 for (; start <= &b->sets[b->nsets]; start++) {
877 ret = bch_bset_search(b, start, search); 903 ret = bch_bset_search(b, start, search);
878 bch_btree_iter_push(iter, ret, end(start->data)); 904 bch_btree_iter_push(iter, ret, end(start->data));
@@ -887,6 +913,8 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter)
887 struct bkey *ret = NULL; 913 struct bkey *ret = NULL;
888 914
889 if (!btree_iter_end(iter)) { 915 if (!btree_iter_end(iter)) {
916 bch_btree_iter_next_check(iter);
917
890 ret = iter->data->k; 918 ret = iter->data->k;
891 iter->data->k = bkey_next(iter->data->k); 919 iter->data->k = bkey_next(iter->data->k);
892 920
@@ -916,14 +944,6 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
916 return ret; 944 return ret;
917} 945}
918 946
919struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search)
920{
921 struct btree_iter iter;
922
923 bch_btree_iter_init(b, &iter, search);
924 return bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
925}
926
927/* Mergesort */ 947/* Mergesort */
928 948
929static void sort_key_next(struct btree_iter *iter, 949static void sort_key_next(struct btree_iter *iter,
@@ -998,7 +1018,6 @@ static void btree_mergesort(struct btree *b, struct bset *out,
998 out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0; 1018 out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0;
999 1019
1000 pr_debug("sorted %i keys", out->keys); 1020 pr_debug("sorted %i keys", out->keys);
1001 bch_check_key_order(b, out);
1002} 1021}
1003 1022
1004static void __btree_sort(struct btree *b, struct btree_iter *iter, 1023static void __btree_sort(struct btree *b, struct btree_iter *iter,
@@ -1029,7 +1048,7 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter,
1029 * memcpy() 1048 * memcpy()
1030 */ 1049 */
1031 1050
1032 out->magic = bset_magic(b->c); 1051 out->magic = bset_magic(&b->c->sb);
1033 out->seq = b->sets[0].data->seq; 1052 out->seq = b->sets[0].data->seq;
1034 out->version = b->sets[0].data->version; 1053 out->version = b->sets[0].data->version;
1035 swap(out, b->sets[0].data); 1054 swap(out, b->sets[0].data);
@@ -1050,24 +1069,21 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter,
1050 if (b->written) 1069 if (b->written)
1051 bset_build_written_tree(b); 1070 bset_build_written_tree(b);
1052 1071
1053 if (!start) { 1072 if (!start)
1054 spin_lock(&b->c->sort_time_lock);
1055 bch_time_stats_update(&b->c->sort_time, start_time); 1073 bch_time_stats_update(&b->c->sort_time, start_time);
1056 spin_unlock(&b->c->sort_time_lock);
1057 }
1058} 1074}
1059 1075
1060void bch_btree_sort_partial(struct btree *b, unsigned start) 1076void bch_btree_sort_partial(struct btree *b, unsigned start)
1061{ 1077{
1062 size_t oldsize = 0, order = b->page_order, keys = 0; 1078 size_t order = b->page_order, keys = 0;
1063 struct btree_iter iter; 1079 struct btree_iter iter;
1080 int oldsize = bch_count_data(b);
1081
1064 __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); 1082 __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]);
1065 1083
1066 BUG_ON(b->sets[b->nsets].data == write_block(b) && 1084 BUG_ON(b->sets[b->nsets].data == write_block(b) &&
1067 (b->sets[b->nsets].size || b->nsets)); 1085 (b->sets[b->nsets].size || b->nsets));
1068 1086
1069 if (b->written)
1070 oldsize = bch_count_data(b);
1071 1087
1072 if (start) { 1088 if (start) {
1073 unsigned i; 1089 unsigned i;
@@ -1083,7 +1099,7 @@ void bch_btree_sort_partial(struct btree *b, unsigned start)
1083 1099
1084 __btree_sort(b, &iter, start, order, false); 1100 __btree_sort(b, &iter, start, order, false);
1085 1101
1086 EBUG_ON(b->written && bch_count_data(b) != oldsize); 1102 EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize);
1087} 1103}
1088 1104
1089void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) 1105void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter)
@@ -1101,9 +1117,7 @@ void bch_btree_sort_into(struct btree *b, struct btree *new)
1101 1117
1102 btree_mergesort(b, new->sets->data, &iter, false, true); 1118 btree_mergesort(b, new->sets->data, &iter, false, true);
1103 1119
1104 spin_lock(&b->c->sort_time_lock);
1105 bch_time_stats_update(&b->c->sort_time, start_time); 1120 bch_time_stats_update(&b->c->sort_time, start_time);
1106 spin_unlock(&b->c->sort_time_lock);
1107 1121
1108 bkey_copy_key(&new->key, &b->key); 1122 bkey_copy_key(&new->key, &b->key);
1109 new->sets->size = 0; 1123 new->sets->size = 0;
@@ -1148,16 +1162,16 @@ out:
1148/* Sysfs stuff */ 1162/* Sysfs stuff */
1149 1163
1150struct bset_stats { 1164struct bset_stats {
1165 struct btree_op op;
1151 size_t nodes; 1166 size_t nodes;
1152 size_t sets_written, sets_unwritten; 1167 size_t sets_written, sets_unwritten;
1153 size_t bytes_written, bytes_unwritten; 1168 size_t bytes_written, bytes_unwritten;
1154 size_t floats, failed; 1169 size_t floats, failed;
1155}; 1170};
1156 1171
1157static int bch_btree_bset_stats(struct btree *b, struct btree_op *op, 1172static int btree_bset_stats(struct btree_op *op, struct btree *b)
1158 struct bset_stats *stats)
1159{ 1173{
1160 struct bkey *k; 1174 struct bset_stats *stats = container_of(op, struct bset_stats, op);
1161 unsigned i; 1175 unsigned i;
1162 1176
1163 stats->nodes++; 1177 stats->nodes++;
@@ -1182,30 +1196,19 @@ static int bch_btree_bset_stats(struct btree *b, struct btree_op *op,
1182 } 1196 }
1183 } 1197 }
1184 1198
1185 if (b->level) { 1199 return MAP_CONTINUE;
1186 struct btree_iter iter;
1187
1188 for_each_key_filter(b, k, &iter, bch_ptr_bad) {
1189 int ret = btree(bset_stats, k, b, op, stats);
1190 if (ret)
1191 return ret;
1192 }
1193 }
1194
1195 return 0;
1196} 1200}
1197 1201
1198int bch_bset_print_stats(struct cache_set *c, char *buf) 1202int bch_bset_print_stats(struct cache_set *c, char *buf)
1199{ 1203{
1200 struct btree_op op;
1201 struct bset_stats t; 1204 struct bset_stats t;
1202 int ret; 1205 int ret;
1203 1206
1204 bch_btree_op_init_stack(&op);
1205 memset(&t, 0, sizeof(struct bset_stats)); 1207 memset(&t, 0, sizeof(struct bset_stats));
1208 bch_btree_op_init(&t.op, -1);
1206 1209
1207 ret = btree_root(bset_stats, c, &op, &t); 1210 ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats);
1208 if (ret) 1211 if (ret < 0)
1209 return ret; 1212 return ret;
1210 1213
1211 return snprintf(buf, PAGE_SIZE, 1214 return snprintf(buf, PAGE_SIZE,
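
bch_bset_print_stats() above now embeds a struct btree_op inside bset_stats
and lets the per-node callback recover the outer struct with container_of(),
rather than threading a separate stats argument through btree_root(). A
stripped-down standalone illustration of that callback shape follows; the
struct contents, map_nodes() and the simplified container_of() are stand-ins,
not the bcache definitions:

#include <stddef.h>
#include <stdio.h>

/* simplified container_of(): no type checking, unlike the kernel macro */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct btree_op { int lock; };		/* stand-in for the real struct */

struct bset_stats {
	struct btree_op op;		/* embedded op handed to the callback */
	size_t nodes;
};

/* per-node callback: recovers the caller's stats through the embedded op */
static int btree_bset_stats(struct btree_op *op)
{
	struct bset_stats *stats = container_of(op, struct bset_stats, op);

	stats->nodes++;
	return 0;			/* the real callback returns MAP_CONTINUE */
}

/* toy stand-in for bch_btree_map_nodes(): "visit" three nodes */
static int map_nodes(struct btree_op *op, int (*fn)(struct btree_op *))
{
	for (int i = 0; i < 3; i++) {
		int ret = fn(op);

		if (ret < 0)
			return ret;
	}
	return 0;
}

int main(void)
{
	struct bset_stats t = { .op = { .lock = -1 } };

	map_nodes(&t.op, btree_bset_stats);
	printf("visited %zu nodes\n", t.nodes);	/* 3 */
	return 0;
}
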
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index ae115a253d73..1d3c24f9fa0e 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -148,6 +148,9 @@
148 148
149struct btree_iter { 149struct btree_iter {
150 size_t size, used; 150 size_t size, used;
151#ifdef CONFIG_BCACHE_DEBUG
152 struct btree *b;
153#endif
151 struct btree_iter_set { 154 struct btree_iter_set {
152 struct bkey *k, *end; 155 struct bkey *k, *end;
153 } data[MAX_BSETS]; 156 } data[MAX_BSETS];
@@ -193,54 +196,26 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l,
193 : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); 196 : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
194} 197}
195 198
196static inline size_t bkey_u64s(const struct bkey *k)
197{
198 BUG_ON(KEY_CSUM(k) > 1);
199 return 2 + KEY_PTRS(k) + (KEY_CSUM(k) ? 1 : 0);
200}
201
202static inline size_t bkey_bytes(const struct bkey *k)
203{
204 return bkey_u64s(k) * sizeof(uint64_t);
205}
206
207static inline void bkey_copy(struct bkey *dest, const struct bkey *src)
208{
209 memcpy(dest, src, bkey_bytes(src));
210}
211
212static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src)
213{
214 if (!src)
215 src = &KEY(0, 0, 0);
216
217 SET_KEY_INODE(dest, KEY_INODE(src));
218 SET_KEY_OFFSET(dest, KEY_OFFSET(src));
219}
220
221static inline struct bkey *bkey_next(const struct bkey *k)
222{
223 uint64_t *d = (void *) k;
224 return (struct bkey *) (d + bkey_u64s(k));
225}
226
227/* Keylists */ 199/* Keylists */
228 200
229struct keylist { 201struct keylist {
230 struct bkey *top;
231 union { 202 union {
232 uint64_t *list; 203 struct bkey *keys;
233 struct bkey *bottom; 204 uint64_t *keys_p;
205 };
206 union {
207 struct bkey *top;
208 uint64_t *top_p;
234 }; 209 };
235 210
236 /* Enough room for btree_split's keys without realloc */ 211 /* Enough room for btree_split's keys without realloc */
237#define KEYLIST_INLINE 16 212#define KEYLIST_INLINE 16
238 uint64_t d[KEYLIST_INLINE]; 213 uint64_t inline_keys[KEYLIST_INLINE];
239}; 214};
240 215
241static inline void bch_keylist_init(struct keylist *l) 216static inline void bch_keylist_init(struct keylist *l)
242{ 217{
243 l->top = (void *) (l->list = l->d); 218 l->top_p = l->keys_p = l->inline_keys;
244} 219}
245 220
246static inline void bch_keylist_push(struct keylist *l) 221static inline void bch_keylist_push(struct keylist *l)
@@ -256,17 +231,32 @@ static inline void bch_keylist_add(struct keylist *l, struct bkey *k)
256 231
257static inline bool bch_keylist_empty(struct keylist *l) 232static inline bool bch_keylist_empty(struct keylist *l)
258{ 233{
259 return l->top == (void *) l->list; 234 return l->top == l->keys;
235}
236
237static inline void bch_keylist_reset(struct keylist *l)
238{
239 l->top = l->keys;
260} 240}
261 241
262static inline void bch_keylist_free(struct keylist *l) 242static inline void bch_keylist_free(struct keylist *l)
263{ 243{
264 if (l->list != l->d) 244 if (l->keys_p != l->inline_keys)
265 kfree(l->list); 245 kfree(l->keys_p);
246}
247
248static inline size_t bch_keylist_nkeys(struct keylist *l)
249{
250 return l->top_p - l->keys_p;
251}
252
253static inline size_t bch_keylist_bytes(struct keylist *l)
254{
255 return bch_keylist_nkeys(l) * sizeof(uint64_t);
266} 256}
267 257
268void bch_keylist_copy(struct keylist *, struct keylist *);
269struct bkey *bch_keylist_pop(struct keylist *); 258struct bkey *bch_keylist_pop(struct keylist *);
259void bch_keylist_pop_front(struct keylist *);
270int bch_keylist_realloc(struct keylist *, int, struct cache_set *); 260int bch_keylist_realloc(struct keylist *, int, struct cache_set *);
271 261
272void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, 262void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
@@ -287,7 +277,9 @@ static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
287} 277}
288 278
289const char *bch_ptr_status(struct cache_set *, const struct bkey *); 279const char *bch_ptr_status(struct cache_set *, const struct bkey *);
290bool __bch_ptr_invalid(struct cache_set *, int level, const struct bkey *); 280bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
281bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *);
282
291bool bch_ptr_bad(struct btree *, const struct bkey *); 283bool bch_ptr_bad(struct btree *, const struct bkey *);
292 284
293static inline uint8_t gen_after(uint8_t a, uint8_t b) 285static inline uint8_t gen_after(uint8_t a, uint8_t b)
@@ -311,7 +303,6 @@ static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
311 303
312typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); 304typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *);
313 305
314struct bkey *bch_next_recurse_key(struct btree *, struct bkey *);
315struct bkey *bch_btree_iter_next(struct btree_iter *); 306struct bkey *bch_btree_iter_next(struct btree_iter *);
316struct bkey *bch_btree_iter_next_filter(struct btree_iter *, 307struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
317 struct btree *, ptr_filter_fn); 308 struct btree *, ptr_filter_fn);
@@ -361,12 +352,30 @@ void bch_bset_fix_lookup_table(struct btree *, struct bkey *);
361struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, 352struct bkey *__bch_bset_search(struct btree *, struct bset_tree *,
362 const struct bkey *); 353 const struct bkey *);
363 354
355/*
356 * Returns the first key that is strictly greater than search
357 */
364static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t, 358static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t,
365 const struct bkey *search) 359 const struct bkey *search)
366{ 360{
367 return search ? __bch_bset_search(b, t, search) : t->data->start; 361 return search ? __bch_bset_search(b, t, search) : t->data->start;
368} 362}
369 363
364#define PRECEDING_KEY(_k) \
365({ \
366 struct bkey *_ret = NULL; \
367 \
368 if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \
369 _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \
370 \
371 if (!_ret->low) \
372 _ret->high--; \
373 _ret->low--; \
374 } \
375 \
376 _ret; \
377})
378
370bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *); 379bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *);
371void bch_btree_sort_lazy(struct btree *); 380void bch_btree_sort_lazy(struct btree *);
372void bch_btree_sort_into(struct btree *, struct btree *); 381void bch_btree_sort_into(struct btree *, struct btree *);
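
Sketching the reworked keylist from bset.h above: the start and end of the key
array are now unions of struct bkey * and uint64_t * (keys/keys_p, top/top_p)
over an inline buffer, so bch_keylist_nkeys()/bch_keylist_bytes() are plain
pointer arithmetic and bch_keylist_pop_front() is a memmove. A simplified
standalone model of that layout (fake_bkey and the helper names below are
invented stand-ins; real bkeys encode their length in the key header instead):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* fake key: first u64 holds the key's total length in u64s */
struct fake_bkey {
	uint64_t u64s;
	uint64_t payload[];
};

#define KEYLIST_INLINE 16

struct keylist {
	union {				/* start of the key array */
		struct fake_bkey *keys;
		uint64_t	 *keys_p;
	};
	union {				/* one past the last key */
		struct fake_bkey *top;
		uint64_t	 *top_p;
	};
	uint64_t inline_keys[KEYLIST_INLINE];
};

static void keylist_init(struct keylist *l)
{
	l->top_p = l->keys_p = l->inline_keys;
}

static size_t keylist_nkeys(struct keylist *l)	/* size in u64s */
{
	return l->top_p - l->keys_p;
}

static struct fake_bkey *bkey_next(struct fake_bkey *k)
{
	return (struct fake_bkey *)((uint64_t *)k + k->u64s);
}

/* like bch_keylist_pop_front(): shrink top, slide the rest down */
static void keylist_pop_front(struct keylist *l)
{
	l->top_p -= l->keys->u64s;
	memmove(l->keys, bkey_next(l->keys),
		keylist_nkeys(l) * sizeof(uint64_t));
}

int main(void)
{
	struct keylist l;

	keylist_init(&l);

	l.top->u64s = 3;		/* 3-u64 key: header + two payload words */
	l.top->payload[0] = 11;
	l.top->payload[1] = 12;
	l.top_p += 3;

	l.top->u64s = 2;		/* 2-u64 key */
	l.top->payload[0] = 21;
	l.top_p += 2;

	printf("nkeys = %zu u64s\n", keylist_nkeys(&l));	/* 5 */

	keylist_pop_front(&l);
	printf("nkeys = %zu u64s, first payload = %llu\n",
	       keylist_nkeys(&l),
	       (unsigned long long)l.keys->payload[0]);		/* 2, 21 */
	return 0;
}
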
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index f42fc7ed9cd6..5e2765aadce1 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -23,12 +23,13 @@
23#include "bcache.h" 23#include "bcache.h"
24#include "btree.h" 24#include "btree.h"
25#include "debug.h" 25#include "debug.h"
26#include "request.h"
27#include "writeback.h" 26#include "writeback.h"
28 27
29#include <linux/slab.h> 28#include <linux/slab.h>
30#include <linux/bitops.h> 29#include <linux/bitops.h>
30#include <linux/freezer.h>
31#include <linux/hash.h> 31#include <linux/hash.h>
32#include <linux/kthread.h>
32#include <linux/prefetch.h> 33#include <linux/prefetch.h>
33#include <linux/random.h> 34#include <linux/random.h>
34#include <linux/rcupdate.h> 35#include <linux/rcupdate.h>
@@ -88,15 +89,13 @@
88 * Test module load/unload 89 * Test module load/unload
89 */ 90 */
90 91
91static const char * const op_types[] = { 92enum {
92 "insert", "replace" 93 BTREE_INSERT_STATUS_INSERT,
94 BTREE_INSERT_STATUS_BACK_MERGE,
95 BTREE_INSERT_STATUS_OVERWROTE,
96 BTREE_INSERT_STATUS_FRONT_MERGE,
93}; 97};
94 98
95static const char *op_type(struct btree_op *op)
96{
97 return op_types[op->type];
98}
99
100#define MAX_NEED_GC 64 99#define MAX_NEED_GC 64
101#define MAX_SAVE_PRIO 72 100#define MAX_SAVE_PRIO 72
102 101
@@ -105,23 +104,89 @@ static const char *op_type(struct btree_op *op)
105#define PTR_HASH(c, k) \ 104#define PTR_HASH(c, k) \
106 (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) 105 (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
107 106
108struct workqueue_struct *bch_gc_wq;
109static struct workqueue_struct *btree_io_wq; 107static struct workqueue_struct *btree_io_wq;
110 108
111void bch_btree_op_init_stack(struct btree_op *op) 109static inline bool should_split(struct btree *b)
112{ 110{
113 memset(op, 0, sizeof(struct btree_op)); 111 struct bset *i = write_block(b);
114 closure_init_stack(&op->cl); 112 return b->written >= btree_blocks(b) ||
115 op->lock = -1; 113 (b->written + __set_blocks(i, i->keys + 15, b->c)
116 bch_keylist_init(&op->keys); 114 > btree_blocks(b));
117} 115}
118 116
117#define insert_lock(s, b) ((b)->level <= (s)->lock)
118
119/*
120 * These macros are for recursing down the btree - they handle the details of
121 * locking and looking up nodes in the cache for you. They're best treated as
122 * mere syntax when reading code that uses them.
123 *
124 * op->lock determines whether we take a read or a write lock at a given depth.
125 * If you've got a read lock and find that you need a write lock (i.e. you're
126 * going to have to split), set op->lock and return -EINTR; btree_root() will
127 * call you again and you'll have the correct lock.
128 */
129
130/**
131 * btree - recurse down the btree on a specified key
132 * @fn: function to call, which will be passed the child node
133 * @key: key to recurse on
134 * @b: parent btree node
135 * @op: pointer to struct btree_op
136 */
137#define btree(fn, key, b, op, ...) \
138({ \
139 int _r, l = (b)->level - 1; \
140 bool _w = l <= (op)->lock; \
141 struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \
142 if (!IS_ERR(_child)) { \
143 _child->parent = (b); \
144 _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \
145 rw_unlock(_w, _child); \
146 } else \
147 _r = PTR_ERR(_child); \
148 _r; \
149})
150
151/**
152 * btree_root - call a function on the root of the btree
153 * @fn: function to call, which will be passed the child node
154 * @c: cache set
155 * @op: pointer to struct btree_op
156 */
157#define btree_root(fn, c, op, ...) \
158({ \
159 int _r = -EINTR; \
160 do { \
161 struct btree *_b = (c)->root; \
162 bool _w = insert_lock(op, _b); \
163 rw_lock(_w, _b, _b->level); \
164 if (_b == (c)->root && \
165 _w == insert_lock(op, _b)) { \
166 _b->parent = NULL; \
167 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
168 } \
169 rw_unlock(_w, _b); \
170 bch_cannibalize_unlock(c); \
171 if (_r == -ENOSPC) { \
172 wait_event((c)->try_wait, \
173 !(c)->try_harder); \
174 _r = -EINTR; \
175 } \
176 } while (_r == -EINTR); \
177 \
178 _r; \
179})
180
119/* Btree key manipulation */ 181/* Btree key manipulation */
120 182
121static void bkey_put(struct cache_set *c, struct bkey *k, int level) 183void bkey_put(struct cache_set *c, struct bkey *k)
122{ 184{
123 if ((level && KEY_OFFSET(k)) || !level) 185 unsigned i;
124 __bkey_put(c, k); 186
187 for (i = 0; i < KEY_PTRS(k); i++)
188 if (ptr_available(c, k, i))
189 atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
125} 190}
126 191
127/* Btree IO */ 192/* Btree IO */
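
Loosely, the btree()/btree_root() macros introduced above implement a restart
loop: a traversal that discovers it needs a stronger lock (or runs out of
space) unwinds to the root with -EINTR, op->lock is adjusted, and the whole
descent is retried. A bare-bones sketch of only that control flow, with
invented stand-in names and none of the real locking or node lookup:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct btree_op { int lock; };	/* depth at or below which a write lock is taken */

/*
 * Pretend traversal at the root (level 0): it succeeds only once it holds a
 * write lock, mimicking "realized we must split, set op->lock, return -EINTR".
 */
static int traverse_root(struct btree_op *op, bool write_locked)
{
	if (!write_locked) {
		op->lock = 0;
		return -EINTR;
	}
	return 0;
}

int main(void)
{
	struct btree_op op = { .lock = -1 };	/* read locks everywhere to start */
	int ret, attempts = 0;

	do {
		bool write = 0 <= op.lock;	/* insert_lock() analogue at level 0 */

		attempts++;
		ret = traverse_root(&op, write);
		/* the real macro also unlocks the node and releases any
		 * cannibalized bucket before retrying */
	} while (ret == -EINTR);

	printf("done after %d attempts, ret = %d\n", attempts, ret);	/* 2, 0 */
	return 0;
}
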
@@ -145,6 +210,10 @@ static void bch_btree_node_read_done(struct btree *b)
145 iter->size = b->c->sb.bucket_size / b->c->sb.block_size; 210 iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
146 iter->used = 0; 211 iter->used = 0;
147 212
213#ifdef CONFIG_BCACHE_DEBUG
214 iter->b = b;
215#endif
216
148 if (!i->seq) 217 if (!i->seq)
149 goto err; 218 goto err;
150 219
@@ -160,7 +229,7 @@ static void bch_btree_node_read_done(struct btree *b)
160 goto err; 229 goto err;
161 230
162 err = "bad magic"; 231 err = "bad magic";
163 if (i->magic != bset_magic(b->c)) 232 if (i->magic != bset_magic(&b->c->sb))
164 goto err; 233 goto err;
165 234
166 err = "bad checksum"; 235 err = "bad checksum";
@@ -248,10 +317,7 @@ void bch_btree_node_read(struct btree *b)
248 goto err; 317 goto err;
249 318
250 bch_btree_node_read_done(b); 319 bch_btree_node_read_done(b);
251
252 spin_lock(&b->c->btree_read_time_lock);
253 bch_time_stats_update(&b->c->btree_read_time, start_time); 320 bch_time_stats_update(&b->c->btree_read_time, start_time);
254 spin_unlock(&b->c->btree_read_time_lock);
255 321
256 return; 322 return;
257err: 323err:
@@ -327,7 +393,7 @@ static void do_btree_node_write(struct btree *b)
327 b->bio = bch_bbio_alloc(b->c); 393 b->bio = bch_bbio_alloc(b->c);
328 394
329 b->bio->bi_end_io = btree_node_write_endio; 395 b->bio->bi_end_io = btree_node_write_endio;
330 b->bio->bi_private = &b->io.cl; 396 b->bio->bi_private = cl;
331 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; 397 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
332 b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); 398 b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
333 bch_bio_map(b->bio, i); 399 bch_bio_map(b->bio, i);
@@ -383,7 +449,7 @@ void bch_btree_node_write(struct btree *b, struct closure *parent)
383 BUG_ON(b->written >= btree_blocks(b)); 449 BUG_ON(b->written >= btree_blocks(b));
384 BUG_ON(b->written && !i->keys); 450 BUG_ON(b->written && !i->keys);
385 BUG_ON(b->sets->data->seq != i->seq); 451 BUG_ON(b->sets->data->seq != i->seq);
386 bch_check_key_order(b, i); 452 bch_check_keys(b, "writing");
387 453
388 cancel_delayed_work(&b->work); 454 cancel_delayed_work(&b->work);
389 455
@@ -405,6 +471,15 @@ void bch_btree_node_write(struct btree *b, struct closure *parent)
405 bch_bset_init_next(b); 471 bch_bset_init_next(b);
406} 472}
407 473
474static void bch_btree_node_write_sync(struct btree *b)
475{
476 struct closure cl;
477
478 closure_init_stack(&cl);
479 bch_btree_node_write(b, &cl);
480 closure_sync(&cl);
481}
482
408static void btree_node_write_work(struct work_struct *w) 483static void btree_node_write_work(struct work_struct *w)
409{ 484{
410 struct btree *b = container_of(to_delayed_work(w), struct btree, work); 485 struct btree *b = container_of(to_delayed_work(w), struct btree, work);
@@ -416,7 +491,7 @@ static void btree_node_write_work(struct work_struct *w)
416 rw_unlock(true, b); 491 rw_unlock(true, b);
417} 492}
418 493
419static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) 494static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
420{ 495{
421 struct bset *i = b->sets[b->nsets].data; 496 struct bset *i = b->sets[b->nsets].data;
422 struct btree_write *w = btree_current_write(b); 497 struct btree_write *w = btree_current_write(b);
@@ -429,15 +504,15 @@ static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)
429 504
430 set_btree_node_dirty(b); 505 set_btree_node_dirty(b);
431 506
432 if (op && op->journal) { 507 if (journal_ref) {
433 if (w->journal && 508 if (w->journal &&
434 journal_pin_cmp(b->c, w, op)) { 509 journal_pin_cmp(b->c, w->journal, journal_ref)) {
435 atomic_dec_bug(w->journal); 510 atomic_dec_bug(w->journal);
436 w->journal = NULL; 511 w->journal = NULL;
437 } 512 }
438 513
439 if (!w->journal) { 514 if (!w->journal) {
440 w->journal = op->journal; 515 w->journal = journal_ref;
441 atomic_inc(w->journal); 516 atomic_inc(w->journal);
442 } 517 }
443 } 518 }
@@ -566,33 +641,32 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
566 return b; 641 return b;
567} 642}
568 643
569static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) 644static int mca_reap(struct btree *b, unsigned min_order, bool flush)
570{ 645{
646 struct closure cl;
647
648 closure_init_stack(&cl);
571 lockdep_assert_held(&b->c->bucket_lock); 649 lockdep_assert_held(&b->c->bucket_lock);
572 650
573 if (!down_write_trylock(&b->lock)) 651 if (!down_write_trylock(&b->lock))
574 return -ENOMEM; 652 return -ENOMEM;
575 653
576 if (b->page_order < min_order) { 654 BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
655
656 if (b->page_order < min_order ||
657 (!flush &&
658 (btree_node_dirty(b) ||
659 atomic_read(&b->io.cl.remaining) != -1))) {
577 rw_unlock(true, b); 660 rw_unlock(true, b);
578 return -ENOMEM; 661 return -ENOMEM;
579 } 662 }
580 663
581 BUG_ON(btree_node_dirty(b) && !b->sets[0].data); 664 if (btree_node_dirty(b))
582 665 bch_btree_node_write_sync(b);
583 if (cl && btree_node_dirty(b))
584 bch_btree_node_write(b, NULL);
585
586 if (cl)
587 closure_wait_event_async(&b->io.wait, cl,
588 atomic_read(&b->io.cl.remaining) == -1);
589 666
590 if (btree_node_dirty(b) || 667 /* wait for any in flight btree write */
591 !closure_is_unlocked(&b->io.cl) || 668 closure_wait_event(&b->io.wait, &cl,
592 work_pending(&b->work.work)) { 669 atomic_read(&b->io.cl.remaining) == -1);
593 rw_unlock(true, b);
594 return -EAGAIN;
595 }
596 670
597 return 0; 671 return 0;
598} 672}
@@ -633,7 +707,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
633 break; 707 break;
634 708
635 if (++i > 3 && 709 if (++i > 3 &&
636 !mca_reap(b, NULL, 0)) { 710 !mca_reap(b, 0, false)) {
637 mca_data_free(b); 711 mca_data_free(b);
638 rw_unlock(true, b); 712 rw_unlock(true, b);
639 freed++; 713 freed++;
@@ -652,7 +726,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
652 list_rotate_left(&c->btree_cache); 726 list_rotate_left(&c->btree_cache);
653 727
654 if (!b->accessed && 728 if (!b->accessed &&
655 !mca_reap(b, NULL, 0)) { 729 !mca_reap(b, 0, false)) {
656 mca_bucket_free(b); 730 mca_bucket_free(b);
657 mca_data_free(b); 731 mca_data_free(b);
658 rw_unlock(true, b); 732 rw_unlock(true, b);
@@ -723,12 +797,9 @@ int bch_btree_cache_alloc(struct cache_set *c)
723{ 797{
724 unsigned i; 798 unsigned i;
725 799
726 /* XXX: doesn't check for errors */
727
728 closure_init_unlocked(&c->gc);
729
730 for (i = 0; i < mca_reserve(c); i++) 800 for (i = 0; i < mca_reserve(c); i++)
731 mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); 801 if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
802 return -ENOMEM;
732 803
733 list_splice_init(&c->btree_cache, 804 list_splice_init(&c->btree_cache,
734 &c->btree_cache_freeable); 805 &c->btree_cache_freeable);
@@ -775,52 +846,27 @@ out:
775 return b; 846 return b;
776} 847}
777 848
778static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, 849static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
779 int level, struct closure *cl)
780{ 850{
781 int ret = -ENOMEM; 851 struct btree *b;
782 struct btree *i;
783 852
784 trace_bcache_btree_cache_cannibalize(c); 853 trace_bcache_btree_cache_cannibalize(c);
785 854
786 if (!cl) 855 if (!c->try_harder) {
787 return ERR_PTR(-ENOMEM); 856 c->try_harder = current;
788 857 c->try_harder_start = local_clock();
789 /* 858 } else if (c->try_harder != current)
790 * Trying to free up some memory - i.e. reuse some btree nodes - may 859 return ERR_PTR(-ENOSPC);
791 * require initiating IO to flush the dirty part of the node. If we're
792 * running under generic_make_request(), that IO will never finish and
793 * we would deadlock. Returning -EAGAIN causes the cache lookup code to
794 * punt to workqueue and retry.
795 */
796 if (current->bio_list)
797 return ERR_PTR(-EAGAIN);
798
799 if (c->try_harder && c->try_harder != cl) {
800 closure_wait_event_async(&c->try_wait, cl, !c->try_harder);
801 return ERR_PTR(-EAGAIN);
802 }
803 860
804 c->try_harder = cl; 861 list_for_each_entry_reverse(b, &c->btree_cache, list)
805 c->try_harder_start = local_clock(); 862 if (!mca_reap(b, btree_order(k), false))
806retry: 863 return b;
807 list_for_each_entry_reverse(i, &c->btree_cache, list) {
808 int r = mca_reap(i, cl, btree_order(k));
809 if (!r)
810 return i;
811 if (r != -ENOMEM)
812 ret = r;
813 }
814 864
815 if (ret == -EAGAIN && 865 list_for_each_entry_reverse(b, &c->btree_cache, list)
816 closure_blocking(cl)) { 866 if (!mca_reap(b, btree_order(k), true))
817 mutex_unlock(&c->bucket_lock); 867 return b;
818 closure_sync(cl);
819 mutex_lock(&c->bucket_lock);
820 goto retry;
821 }
822 868
823 return ERR_PTR(ret); 869 return ERR_PTR(-ENOMEM);
824} 870}
825 871
826/* 872/*
@@ -829,20 +875,21 @@ retry:
829 * cannibalize_bucket() will take. This means every time we unlock the root of 875 * cannibalize_bucket() will take. This means every time we unlock the root of
830 * the btree, we need to release this lock if we have it held. 876 * the btree, we need to release this lock if we have it held.
831 */ 877 */
832void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) 878static void bch_cannibalize_unlock(struct cache_set *c)
833{ 879{
834 if (c->try_harder == cl) { 880 if (c->try_harder == current) {
835 bch_time_stats_update(&c->try_harder_time, c->try_harder_start); 881 bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
836 c->try_harder = NULL; 882 c->try_harder = NULL;
837 __closure_wake_up(&c->try_wait); 883 wake_up(&c->try_wait);
838 } 884 }
839} 885}
840 886
841static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, 887static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
842 int level, struct closure *cl)
843{ 888{
844 struct btree *b; 889 struct btree *b;
845 890
891 BUG_ON(current->bio_list);
892
846 lockdep_assert_held(&c->bucket_lock); 893 lockdep_assert_held(&c->bucket_lock);
847 894
848 if (mca_find(c, k)) 895 if (mca_find(c, k))
@@ -852,14 +899,14 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k,
852 * the list. Check if there's any freed nodes there: 899 * the list. Check if there's any freed nodes there:
853 */ 900 */
854 list_for_each_entry(b, &c->btree_cache_freeable, list) 901 list_for_each_entry(b, &c->btree_cache_freeable, list)
855 if (!mca_reap(b, NULL, btree_order(k))) 902 if (!mca_reap(b, btree_order(k), false))
856 goto out; 903 goto out;
857 904
858 /* We never free struct btree itself, just the memory that holds the on 905 /* We never free struct btree itself, just the memory that holds the on
859 * disk node. Check the freed list before allocating a new one: 906 * disk node. Check the freed list before allocating a new one:
860 */ 907 */
861 list_for_each_entry(b, &c->btree_cache_freed, list) 908 list_for_each_entry(b, &c->btree_cache_freed, list)
862 if (!mca_reap(b, NULL, 0)) { 909 if (!mca_reap(b, 0, false)) {
863 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); 910 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
864 if (!b->sets[0].data) 911 if (!b->sets[0].data)
865 goto err; 912 goto err;
@@ -884,6 +931,7 @@ out:
884 931
885 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); 932 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
886 b->level = level; 933 b->level = level;
934 b->parent = (void *) ~0UL;
887 935
888 mca_reinit(b); 936 mca_reinit(b);
889 937
@@ -892,7 +940,7 @@ err:
892 if (b) 940 if (b)
893 rw_unlock(true, b); 941 rw_unlock(true, b);
894 942
895 b = mca_cannibalize(c, k, level, cl); 943 b = mca_cannibalize(c, k);
896 if (!IS_ERR(b)) 944 if (!IS_ERR(b))
897 goto out; 945 goto out;
898 946
@@ -903,17 +951,15 @@ err:
903 * bch_btree_node_get - find a btree node in the cache and lock it, reading it 951 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
904 * in from disk if necessary. 952 * in from disk if necessary.
905 * 953 *
906 * If IO is necessary, it uses the closure embedded in struct btree_op to wait; 954 * If IO is necessary and running under generic_make_request, returns -EAGAIN.
907 * if that closure is in non blocking mode, will return -EAGAIN.
908 * 955 *
909 * The btree node will have either a read or a write lock held, depending on 956 * The btree node will have either a read or a write lock held, depending on
910 * level and op->lock. 957 * level and op->lock.
911 */ 958 */
912struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, 959struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
913 int level, struct btree_op *op) 960 int level, bool write)
914{ 961{
915 int i = 0; 962 int i = 0;
916 bool write = level <= op->lock;
917 struct btree *b; 963 struct btree *b;
918 964
919 BUG_ON(level < 0); 965 BUG_ON(level < 0);
@@ -925,7 +971,7 @@ retry:
925 return ERR_PTR(-EAGAIN); 971 return ERR_PTR(-EAGAIN);
926 972
927 mutex_lock(&c->bucket_lock); 973 mutex_lock(&c->bucket_lock);
928 b = mca_alloc(c, k, level, &op->cl); 974 b = mca_alloc(c, k, level);
929 mutex_unlock(&c->bucket_lock); 975 mutex_unlock(&c->bucket_lock);
930 976
931 if (!b) 977 if (!b)
@@ -971,7 +1017,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
971 struct btree *b; 1017 struct btree *b;
972 1018
973 mutex_lock(&c->bucket_lock); 1019 mutex_lock(&c->bucket_lock);
974 b = mca_alloc(c, k, level, NULL); 1020 b = mca_alloc(c, k, level);
975 mutex_unlock(&c->bucket_lock); 1021 mutex_unlock(&c->bucket_lock);
976 1022
977 if (!IS_ERR_OR_NULL(b)) { 1023 if (!IS_ERR_OR_NULL(b)) {
@@ -982,17 +1028,12 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
982 1028
983/* Btree alloc */ 1029/* Btree alloc */
984 1030
985static void btree_node_free(struct btree *b, struct btree_op *op) 1031static void btree_node_free(struct btree *b)
986{ 1032{
987 unsigned i; 1033 unsigned i;
988 1034
989 trace_bcache_btree_node_free(b); 1035 trace_bcache_btree_node_free(b);
990 1036
991 /*
992 * The BUG_ON() in btree_node_get() implies that we must have a write
993 * lock on parent to free or even invalidate a node
994 */
995 BUG_ON(op->lock <= b->level);
996 BUG_ON(b == b->c->root); 1037 BUG_ON(b == b->c->root);
997 1038
998 if (btree_node_dirty(b)) 1039 if (btree_node_dirty(b))
@@ -1015,27 +1056,26 @@ static void btree_node_free(struct btree *b, struct btree_op *op)
1015 mutex_unlock(&b->c->bucket_lock); 1056 mutex_unlock(&b->c->bucket_lock);
1016} 1057}
1017 1058
1018struct btree *bch_btree_node_alloc(struct cache_set *c, int level, 1059struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
1019 struct closure *cl)
1020{ 1060{
1021 BKEY_PADDED(key) k; 1061 BKEY_PADDED(key) k;
1022 struct btree *b = ERR_PTR(-EAGAIN); 1062 struct btree *b = ERR_PTR(-EAGAIN);
1023 1063
1024 mutex_lock(&c->bucket_lock); 1064 mutex_lock(&c->bucket_lock);
1025retry: 1065retry:
1026 if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl)) 1066 if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait))
1027 goto err; 1067 goto err;
1028 1068
1069 bkey_put(c, &k.key);
1029 SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); 1070 SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
1030 1071
1031 b = mca_alloc(c, &k.key, level, cl); 1072 b = mca_alloc(c, &k.key, level);
1032 if (IS_ERR(b)) 1073 if (IS_ERR(b))
1033 goto err_free; 1074 goto err_free;
1034 1075
1035 if (!b) { 1076 if (!b) {
1036 cache_bug(c, 1077 cache_bug(c,
1037 "Tried to allocate bucket that was in btree cache"); 1078 "Tried to allocate bucket that was in btree cache");
1038 __bkey_put(c, &k.key);
1039 goto retry; 1079 goto retry;
1040 } 1080 }
1041 1081
@@ -1048,7 +1088,6 @@ retry:
1048 return b; 1088 return b;
1049err_free: 1089err_free:
1050 bch_bucket_free(c, &k.key); 1090 bch_bucket_free(c, &k.key);
1051 __bkey_put(c, &k.key);
1052err: 1091err:
1053 mutex_unlock(&c->bucket_lock); 1092 mutex_unlock(&c->bucket_lock);
1054 1093
@@ -1056,16 +1095,31 @@ err:
1056 return b; 1095 return b;
1057} 1096}
1058 1097
1059static struct btree *btree_node_alloc_replacement(struct btree *b, 1098static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait)
1060 struct closure *cl)
1061{ 1099{
1062 struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); 1100 struct btree *n = bch_btree_node_alloc(b->c, b->level, wait);
1063 if (!IS_ERR_OR_NULL(n)) 1101 if (!IS_ERR_OR_NULL(n))
1064 bch_btree_sort_into(b, n); 1102 bch_btree_sort_into(b, n);
1065 1103
1066 return n; 1104 return n;
1067} 1105}
1068 1106
1107static void make_btree_freeing_key(struct btree *b, struct bkey *k)
1108{
1109 unsigned i;
1110
1111 bkey_copy(k, &b->key);
1112 bkey_copy_key(k, &ZERO_KEY);
1113
1114 for (i = 0; i < KEY_PTRS(k); i++) {
1115 uint8_t g = PTR_BUCKET(b->c, k, i)->gen + 1;
1116
1117 SET_PTR_GEN(k, i, g);
1118 }
1119
1120 atomic_inc(&b->c->prio_blocked);
1121}
1122
1069/* Garbage collection */ 1123/* Garbage collection */
1070 1124
1071uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) 1125uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
@@ -1119,12 +1173,10 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
1119 1173
1120#define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) 1174#define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k)
1121 1175
1122static int btree_gc_mark_node(struct btree *b, unsigned *keys, 1176static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
1123 struct gc_stat *gc)
1124{ 1177{
1125 uint8_t stale = 0; 1178 uint8_t stale = 0;
1126 unsigned last_dev = -1; 1179 unsigned keys = 0, good_keys = 0;
1127 struct bcache_device *d = NULL;
1128 struct bkey *k; 1180 struct bkey *k;
1129 struct btree_iter iter; 1181 struct btree_iter iter;
1130 struct bset_tree *t; 1182 struct bset_tree *t;
@@ -1132,27 +1184,17 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys,
1132 gc->nodes++; 1184 gc->nodes++;
1133 1185
1134 for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1186 for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
1135 if (last_dev != KEY_INODE(k)) {
1136 last_dev = KEY_INODE(k);
1137
1138 d = KEY_INODE(k) < b->c->nr_uuids
1139 ? b->c->devices[last_dev]
1140 : NULL;
1141 }
1142
1143 stale = max(stale, btree_mark_key(b, k)); 1187 stale = max(stale, btree_mark_key(b, k));
1188 keys++;
1144 1189
1145 if (bch_ptr_bad(b, k)) 1190 if (bch_ptr_bad(b, k))
1146 continue; 1191 continue;
1147 1192
1148 *keys += bkey_u64s(k);
1149
1150 gc->key_bytes += bkey_u64s(k); 1193 gc->key_bytes += bkey_u64s(k);
1151 gc->nkeys++; 1194 gc->nkeys++;
1195 good_keys++;
1152 1196
1153 gc->data += KEY_SIZE(k); 1197 gc->data += KEY_SIZE(k);
1154 if (KEY_DIRTY(k))
1155 gc->dirty += KEY_SIZE(k);
1156 } 1198 }
1157 1199
1158 for (t = b->sets; t <= &b->sets[b->nsets]; t++) 1200 for (t = b->sets; t <= &b->sets[b->nsets]; t++)
@@ -1161,78 +1203,74 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys,
1161 bkey_cmp(&b->key, &t->end) < 0, 1203 bkey_cmp(&b->key, &t->end) < 0,
1162 b, "found short btree key in gc"); 1204 b, "found short btree key in gc");
1163 1205
1164 return stale; 1206 if (b->c->gc_always_rewrite)
1165} 1207 return true;
1166
1167static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
1168 struct btree_op *op)
1169{
1170 /*
1171 * We block priorities from being written for the duration of garbage
1172 * collection, so we can't sleep in btree_alloc() ->
1173 * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it
1174 * our closure.
1175 */
1176 struct btree *n = btree_node_alloc_replacement(b, NULL);
1177
1178 if (!IS_ERR_OR_NULL(n)) {
1179 swap(b, n);
1180 __bkey_put(b->c, &b->key);
1181 1208
1182 memcpy(k->ptr, b->key.ptr, 1209 if (stale > 10)
1183 sizeof(uint64_t) * KEY_PTRS(&b->key)); 1210 return true;
1184 1211
1185 btree_node_free(n, op); 1212 if ((keys - good_keys) * 2 > keys)
1186 up_write(&n->lock); 1213 return true;
1187 }
1188 1214
1189 return b; 1215 return false;
1190} 1216}
1191 1217
1192/* 1218#define GC_MERGE_NODES 4U
1193 * Leaving this at 2 until we've got incremental garbage collection done; it
1194 * could be higher (and has been tested with 4) except that garbage collection
1195 * could take much longer, adversely affecting latency.
1196 */
1197#define GC_MERGE_NODES 2U
1198 1219
1199struct gc_merge_info { 1220struct gc_merge_info {
1200 struct btree *b; 1221 struct btree *b;
1201 struct bkey *k;
1202 unsigned keys; 1222 unsigned keys;
1203}; 1223};
1204 1224
1205static void btree_gc_coalesce(struct btree *b, struct btree_op *op, 1225static int bch_btree_insert_node(struct btree *, struct btree_op *,
1206 struct gc_stat *gc, struct gc_merge_info *r) 1226 struct keylist *, atomic_t *, struct bkey *);
1227
1228static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1229 struct keylist *keylist, struct gc_stat *gc,
1230 struct gc_merge_info *r)
1207{ 1231{
1208 unsigned nodes = 0, keys = 0, blocks; 1232 unsigned i, nodes = 0, keys = 0, blocks;
1209 int i; 1233 struct btree *new_nodes[GC_MERGE_NODES];
1234 struct closure cl;
1235 struct bkey *k;
1236
1237 memset(new_nodes, 0, sizeof(new_nodes));
1238 closure_init_stack(&cl);
1210 1239
1211 while (nodes < GC_MERGE_NODES && r[nodes].b) 1240 while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
1212 keys += r[nodes++].keys; 1241 keys += r[nodes++].keys;
1213 1242
1214 blocks = btree_default_blocks(b->c) * 2 / 3; 1243 blocks = btree_default_blocks(b->c) * 2 / 3;
1215 1244
1216 if (nodes < 2 || 1245 if (nodes < 2 ||
1217 __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) 1246 __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
1218 return; 1247 return 0;
1219
1220 for (i = nodes - 1; i >= 0; --i) {
1221 if (r[i].b->written)
1222 r[i].b = btree_gc_alloc(r[i].b, r[i].k, op);
1223 1248
1224 if (r[i].b->written) 1249 for (i = 0; i < nodes; i++) {
1225 return; 1250 new_nodes[i] = btree_node_alloc_replacement(r[i].b, false);
1251 if (IS_ERR_OR_NULL(new_nodes[i]))
1252 goto out_nocoalesce;
1226 } 1253 }
1227 1254
1228 for (i = nodes - 1; i > 0; --i) { 1255 for (i = nodes - 1; i > 0; --i) {
1229 struct bset *n1 = r[i].b->sets->data; 1256 struct bset *n1 = new_nodes[i]->sets->data;
1230 struct bset *n2 = r[i - 1].b->sets->data; 1257 struct bset *n2 = new_nodes[i - 1]->sets->data;
1231 struct bkey *k, *last = NULL; 1258 struct bkey *k, *last = NULL;
1232 1259
1233 keys = 0; 1260 keys = 0;
1234 1261
1235 if (i == 1) { 1262 if (i > 1) {
1263 for (k = n2->start;
1264 k < end(n2);
1265 k = bkey_next(k)) {
1266 if (__set_blocks(n1, n1->keys + keys +
1267 bkey_u64s(k), b->c) > blocks)
1268 break;
1269
1270 last = k;
1271 keys += bkey_u64s(k);
1272 }
1273 } else {
1236 /* 1274 /*
1237 * Last node we're not getting rid of - we're getting 1275 * Last node we're not getting rid of - we're getting
1238 * rid of the node at r[0]. Have to try and fit all of 1276 * rid of the node at r[0]. Have to try and fit all of
@@ -1241,37 +1279,27 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
1241 * length keys (shouldn't be possible in practice, 1279 * length keys (shouldn't be possible in practice,
1242 * though) 1280 * though)
1243 */ 1281 */
1244 if (__set_blocks(n1, n1->keys + r->keys, 1282 if (__set_blocks(n1, n1->keys + n2->keys,
1245 b->c) > btree_blocks(r[i].b)) 1283 b->c) > btree_blocks(new_nodes[i]))
1246 return; 1284 goto out_nocoalesce;
1247 1285
1248 keys = n2->keys; 1286 keys = n2->keys;
1287 /* Take the key of the node we're getting rid of */
1249 last = &r->b->key; 1288 last = &r->b->key;
1250 } else 1289 }
1251 for (k = n2->start;
1252 k < end(n2);
1253 k = bkey_next(k)) {
1254 if (__set_blocks(n1, n1->keys + keys +
1255 bkey_u64s(k), b->c) > blocks)
1256 break;
1257
1258 last = k;
1259 keys += bkey_u64s(k);
1260 }
1261 1290
1262 BUG_ON(__set_blocks(n1, n1->keys + keys, 1291 BUG_ON(__set_blocks(n1, n1->keys + keys,
1263 b->c) > btree_blocks(r[i].b)); 1292 b->c) > btree_blocks(new_nodes[i]));
1264 1293
1265 if (last) { 1294 if (last)
1266 bkey_copy_key(&r[i].b->key, last); 1295 bkey_copy_key(&new_nodes[i]->key, last);
1267 bkey_copy_key(r[i].k, last);
1268 }
1269 1296
1270 memcpy(end(n1), 1297 memcpy(end(n1),
1271 n2->start, 1298 n2->start,
1272 (void *) node(n2, keys) - (void *) n2->start); 1299 (void *) node(n2, keys) - (void *) n2->start);
1273 1300
1274 n1->keys += keys; 1301 n1->keys += keys;
1302 r[i].keys = n1->keys;
1275 1303
1276 memmove(n2->start, 1304 memmove(n2->start,
1277 node(n2, keys), 1305 node(n2, keys),
@@ -1279,95 +1307,176 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
1279 1307
1280 n2->keys -= keys; 1308 n2->keys -= keys;
1281 1309
1282 r[i].keys = n1->keys; 1310 if (bch_keylist_realloc(keylist,
1283 r[i - 1].keys = n2->keys; 1311 KEY_PTRS(&new_nodes[i]->key), b->c))
1312 goto out_nocoalesce;
1313
1314 bch_btree_node_write(new_nodes[i], &cl);
1315 bch_keylist_add(keylist, &new_nodes[i]->key);
1284 } 1316 }
1285 1317
1286 btree_node_free(r->b, op); 1318 for (i = 0; i < nodes; i++) {
1287 up_write(&r->b->lock); 1319 if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c))
1320 goto out_nocoalesce;
1288 1321
1289 trace_bcache_btree_gc_coalesce(nodes); 1322 make_btree_freeing_key(r[i].b, keylist->top);
1323 bch_keylist_push(keylist);
1324 }
1325
1326 /* We emptied out this node */
1327 BUG_ON(new_nodes[0]->sets->data->keys);
1328 btree_node_free(new_nodes[0]);
1329 rw_unlock(true, new_nodes[0]);
1330
1331 closure_sync(&cl);
1332
1333 for (i = 0; i < nodes; i++) {
1334 btree_node_free(r[i].b);
1335 rw_unlock(true, r[i].b);
1336
1337 r[i].b = new_nodes[i];
1338 }
1339
1340 bch_btree_insert_node(b, op, keylist, NULL, NULL);
1341 BUG_ON(!bch_keylist_empty(keylist));
1342
1343 memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
1344 r[nodes - 1].b = ERR_PTR(-EINTR);
1290 1345
1346 trace_bcache_btree_gc_coalesce(nodes);
1291 gc->nodes--; 1347 gc->nodes--;
1292 nodes--;
1293 1348
1294 memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes); 1349 /* Invalidated our iterator */
1295 memset(&r[nodes], 0, sizeof(struct gc_merge_info)); 1350 return -EINTR;
1351
1352out_nocoalesce:
1353 closure_sync(&cl);
1354
1355 while ((k = bch_keylist_pop(keylist)))
1356 if (!bkey_cmp(k, &ZERO_KEY))
1357 atomic_dec(&b->c->prio_blocked);
1358
1359 for (i = 0; i < nodes; i++)
1360 if (!IS_ERR_OR_NULL(new_nodes[i])) {
1361 btree_node_free(new_nodes[i]);
1362 rw_unlock(true, new_nodes[i]);
1363 }
1364 return 0;
1296} 1365}
1297 1366
1298static int btree_gc_recurse(struct btree *b, struct btree_op *op, 1367static unsigned btree_gc_count_keys(struct btree *b)
1299 struct closure *writes, struct gc_stat *gc)
1300{ 1368{
1301 void write(struct btree *r) 1369 struct bkey *k;
1302 { 1370 struct btree_iter iter;
1303 if (!r->written) 1371 unsigned ret = 0;
1304 bch_btree_node_write(r, &op->cl);
1305 else if (btree_node_dirty(r))
1306 bch_btree_node_write(r, writes);
1307 1372
1308 up_write(&r->lock); 1373 for_each_key_filter(b, k, &iter, bch_ptr_bad)
1309 } 1374 ret += bkey_u64s(k);
1375
1376 return ret;
1377}
1310 1378
1311 int ret = 0, stale; 1379static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1380 struct closure *writes, struct gc_stat *gc)
1381{
1312 unsigned i; 1382 unsigned i;
1383 int ret = 0;
1384 bool should_rewrite;
1385 struct btree *n;
1386 struct bkey *k;
1387 struct keylist keys;
1388 struct btree_iter iter;
1313 struct gc_merge_info r[GC_MERGE_NODES]; 1389 struct gc_merge_info r[GC_MERGE_NODES];
1390 struct gc_merge_info *last = r + GC_MERGE_NODES - 1;
1314 1391
1315 memset(r, 0, sizeof(r)); 1392 bch_keylist_init(&keys);
1393 bch_btree_iter_init(b, &iter, &b->c->gc_done);
1316 1394
1317 while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { 1395 for (i = 0; i < GC_MERGE_NODES; i++)
1318 r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); 1396 r[i].b = ERR_PTR(-EINTR);
1319 1397
1320 if (IS_ERR(r->b)) { 1398 while (1) {
1321 ret = PTR_ERR(r->b); 1399 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
1322 break; 1400 if (k) {
1401 r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
1402 if (IS_ERR(r->b)) {
1403 ret = PTR_ERR(r->b);
1404 break;
1405 }
1406
1407 r->keys = btree_gc_count_keys(r->b);
1408
1409 ret = btree_gc_coalesce(b, op, &keys, gc, r);
1410 if (ret)
1411 break;
1323 } 1412 }
1324 1413
1325 r->keys = 0; 1414 if (!last->b)
1326 stale = btree_gc_mark_node(r->b, &r->keys, gc); 1415 break;
1327 1416
1328 if (!b->written && 1417 if (!IS_ERR(last->b)) {
1329 (r->b->level || stale > 10 || 1418 should_rewrite = btree_gc_mark_node(last->b, gc);
1330 b->c->gc_always_rewrite)) 1419 if (should_rewrite) {
1331 r->b = btree_gc_alloc(r->b, r->k, op); 1420 n = btree_node_alloc_replacement(last->b,
1421 false);
1332 1422
1333 if (r->b->level) 1423 if (!IS_ERR_OR_NULL(n)) {
1334 ret = btree_gc_recurse(r->b, op, writes, gc); 1424 bch_btree_node_write_sync(n);
1425 bch_keylist_add(&keys, &n->key);
1335 1426
1336 if (ret) { 1427 make_btree_freeing_key(last->b,
1337 write(r->b); 1428 keys.top);
1338 break; 1429 bch_keylist_push(&keys);
1339 } 1430
1431 btree_node_free(last->b);
1432
1433 bch_btree_insert_node(b, op, &keys,
1434 NULL, NULL);
1435 BUG_ON(!bch_keylist_empty(&keys));
1340 1436
1341 bkey_copy_key(&b->c->gc_done, r->k); 1437 rw_unlock(true, last->b);
1438 last->b = n;
1342 1439
1343 if (!b->written) 1440 /* Invalidated our iterator */
1344 btree_gc_coalesce(b, op, gc, r); 1441 ret = -EINTR;
1442 break;
1443 }
1444 }
1345 1445
1346 if (r[GC_MERGE_NODES - 1].b) 1446 if (last->b->level) {
1347 write(r[GC_MERGE_NODES - 1].b); 1447 ret = btree_gc_recurse(last->b, op, writes, gc);
1448 if (ret)
1449 break;
1450 }
1348 1451
1349 memmove(&r[1], &r[0], 1452 bkey_copy_key(&b->c->gc_done, &last->b->key);
1350 sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1)); 1453
1454 /*
1455 * Must flush leaf nodes before gc ends, since replace
1456 * operations aren't journalled
1457 */
1458 if (btree_node_dirty(last->b))
1459 bch_btree_node_write(last->b, writes);
1460 rw_unlock(true, last->b);
1461 }
1462
1463 memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
1464 r->b = NULL;
1351 1465
1352 /* When we've got incremental GC working, we'll want to do
1353 * if (should_resched())
1354 * return -EAGAIN;
1355 */
1356 cond_resched();
1357#if 0
1358 if (need_resched()) { 1466 if (need_resched()) {
1359 ret = -EAGAIN; 1467 ret = -EAGAIN;
1360 break; 1468 break;
1361 } 1469 }
1362#endif
1363 } 1470 }
1364 1471
1365 for (i = 1; i < GC_MERGE_NODES && r[i].b; i++) 1472 for (i = 0; i < GC_MERGE_NODES; i++)
1366 write(r[i].b); 1473 if (!IS_ERR_OR_NULL(r[i].b)) {
1474 if (btree_node_dirty(r[i].b))
1475 bch_btree_node_write(r[i].b, writes);
1476 rw_unlock(true, r[i].b);
1477 }
1367 1478
1368 /* Might have freed some children, must remove their keys */ 1479 bch_keylist_free(&keys);
1369 if (!b->written)
1370 bch_btree_sort(b);
1371 1480
1372 return ret; 1481 return ret;
1373} 1482}
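
The rewritten btree_gc_recurse() above keeps the GC_MERGE_NODES most recently visited child nodes in the r[] array: the node read on the current pass goes in r[0], btree_gc_coalesce() looks at the whole window, and the memmove()/r->b = NULL pair shifts the window before the next iteration. A minimal userspace sketch of that sliding-window bookkeeping follows; the struct fields and the GC_MERGE_NODES value here are illustrative stand-ins, not the kernel definitions.

#include <stdio.h>
#include <string.h>

#define GC_MERGE_NODES 4    /* same idea as the kernel constant; value is illustrative */

struct gc_merge_info {
    int node;               /* stand-in for the struct btree pointer */
    unsigned keys;
};

int main(void)
{
    struct gc_merge_info r[GC_MERGE_NODES];
    int i, n;

    /* start with an empty window, like the ERR_PTR(-EINTR) initialisation */
    memset(r, 0, sizeof(r));

    for (n = 1; n <= 6; n++) {
        /* r[0] always holds the node visited on this pass */
        r[0].node = n;
        r[0].keys = n * 10;

        printf("visiting node %d, window:", n);
        for (i = 0; i < GC_MERGE_NODES; i++)
            printf(" %d", r[i].node);
        printf("\n");

        /* shift the window: the oldest entry falls off the end */
        memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
        r[0].node = 0;
        r[0].keys = 0;
    }
    return 0;
}

Keeping the window in a plain array means coalescing only ever has to look at a handful of adjacent siblings, which is why the shift is a single memmove() rather than any list manipulation.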
@@ -1376,29 +1485,31 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
1376 struct closure *writes, struct gc_stat *gc) 1485 struct closure *writes, struct gc_stat *gc)
1377{ 1486{
1378 struct btree *n = NULL; 1487 struct btree *n = NULL;
1379 unsigned keys = 0; 1488 int ret = 0;
1380 int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); 1489 bool should_rewrite;
1381
1382 if (b->level || stale > 10)
1383 n = btree_node_alloc_replacement(b, NULL);
1384 1490
1385 if (!IS_ERR_OR_NULL(n)) 1491 should_rewrite = btree_gc_mark_node(b, gc);
1386 swap(b, n); 1492 if (should_rewrite) {
1493 n = btree_node_alloc_replacement(b, false);
1387 1494
1388 if (b->level) 1495 if (!IS_ERR_OR_NULL(n)) {
1389 ret = btree_gc_recurse(b, op, writes, gc); 1496 bch_btree_node_write_sync(n);
1497 bch_btree_set_root(n);
1498 btree_node_free(b);
1499 rw_unlock(true, n);
1390 1500
1391 if (!b->written || btree_node_dirty(b)) { 1501 return -EINTR;
1392 bch_btree_node_write(b, n ? &op->cl : NULL); 1502 }
1393 } 1503 }
1394 1504
1395 if (!IS_ERR_OR_NULL(n)) { 1505 if (b->level) {
1396 closure_sync(&op->cl); 1506 ret = btree_gc_recurse(b, op, writes, gc);
1397 bch_btree_set_root(b); 1507 if (ret)
1398 btree_node_free(n, op); 1508 return ret;
1399 rw_unlock(true, b);
1400 } 1509 }
1401 1510
1511 bkey_copy_key(&b->c->gc_done, &b->key);
1512
1402 return ret; 1513 return ret;
1403} 1514}
1404 1515
@@ -1479,9 +1590,8 @@ size_t bch_btree_gc_finish(struct cache_set *c)
1479 return available; 1590 return available;
1480} 1591}
1481 1592
1482static void bch_btree_gc(struct closure *cl) 1593static void bch_btree_gc(struct cache_set *c)
1483{ 1594{
1484 struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
1485 int ret; 1595 int ret;
1486 unsigned long available; 1596 unsigned long available;
1487 struct gc_stat stats; 1597 struct gc_stat stats;
@@ -1493,47 +1603,73 @@ static void bch_btree_gc(struct closure *cl)
1493 1603
1494 memset(&stats, 0, sizeof(struct gc_stat)); 1604 memset(&stats, 0, sizeof(struct gc_stat));
1495 closure_init_stack(&writes); 1605 closure_init_stack(&writes);
1496 bch_btree_op_init_stack(&op); 1606 bch_btree_op_init(&op, SHRT_MAX);
1497 op.lock = SHRT_MAX;
1498 1607
1499 btree_gc_start(c); 1608 btree_gc_start(c);
1500 1609
1501 atomic_inc(&c->prio_blocked); 1610 do {
1502 1611 ret = btree_root(gc_root, c, &op, &writes, &stats);
1503 ret = btree_root(gc_root, c, &op, &writes, &stats); 1612 closure_sync(&writes);
1504 closure_sync(&op.cl);
1505 closure_sync(&writes);
1506
1507 if (ret) {
1508 pr_warn("gc failed!");
1509 continue_at(cl, bch_btree_gc, bch_gc_wq);
1510 }
1511 1613
1512 /* Possibly wait for new UUIDs or whatever to hit disk */ 1614 if (ret && ret != -EAGAIN)
1513 bch_journal_meta(c, &op.cl); 1615 pr_warn("gc failed!");
1514 closure_sync(&op.cl); 1616 } while (ret);
1515 1617
1516 available = bch_btree_gc_finish(c); 1618 available = bch_btree_gc_finish(c);
1517
1518 atomic_dec(&c->prio_blocked);
1519 wake_up_allocators(c); 1619 wake_up_allocators(c);
1520 1620
1521 bch_time_stats_update(&c->btree_gc_time, start_time); 1621 bch_time_stats_update(&c->btree_gc_time, start_time);
1522 1622
1523 stats.key_bytes *= sizeof(uint64_t); 1623 stats.key_bytes *= sizeof(uint64_t);
1524 stats.dirty <<= 9;
1525 stats.data <<= 9; 1624 stats.data <<= 9;
1526 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; 1625 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
1527 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); 1626 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
1528 1627
1529 trace_bcache_gc_end(c); 1628 trace_bcache_gc_end(c);
1530 1629
1531 continue_at(cl, bch_moving_gc, bch_gc_wq); 1630 bch_moving_gc(c);
1631}
1632
1633static int bch_gc_thread(void *arg)
1634{
1635 struct cache_set *c = arg;
1636 struct cache *ca;
1637 unsigned i;
1638
1639 while (1) {
1640again:
1641 bch_btree_gc(c);
1642
1643 set_current_state(TASK_INTERRUPTIBLE);
1644 if (kthread_should_stop())
1645 break;
1646
1647 mutex_lock(&c->bucket_lock);
1648
1649 for_each_cache(ca, c, i)
1650 if (ca->invalidate_needs_gc) {
1651 mutex_unlock(&c->bucket_lock);
1652 set_current_state(TASK_RUNNING);
1653 goto again;
1654 }
1655
1656 mutex_unlock(&c->bucket_lock);
1657
1658 try_to_freeze();
1659 schedule();
1660 }
1661
1662 return 0;
1532} 1663}
1533 1664
1534void bch_queue_gc(struct cache_set *c) 1665int bch_gc_thread_start(struct cache_set *c)
1535{ 1666{
1536 closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl); 1667 c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc");
1668 if (IS_ERR(c->gc_thread))
1669 return PTR_ERR(c->gc_thread);
1670
1671 set_task_state(c->gc_thread, TASK_INTERRUPTIBLE);
1672 return 0;
1537} 1673}
1538 1674
1539/* Initial partial gc */ 1675/* Initial partial gc */
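
With this hunk, garbage collection stops rescheduling itself as a closure on a workqueue: a dedicated thread created by bch_gc_thread_start() runs bch_btree_gc(), sleeps, and either gets woken by wake_up_gc() or loops straight back when a cache still has invalidate_needs_gc set. The following is a rough userspace model of that wake/re-check loop using pthreads; the needs_gc and stop flags and the run_gc() helper are invented for the sketch and only approximate the kthread semantics.

/* build: cc -pthread gc_loop.c */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool needs_gc, stop;

static void run_gc(int pass)
{
    printf("gc pass %d\n", pass);
}

static void *gc_thread(void *arg)
{
    int pass = 0;

    (void)arg;
    for (;;) {
        pthread_mutex_lock(&lock);
        while (!needs_gc && !stop)
            pthread_cond_wait(&cond, &lock);   /* sleep until woken */
        if (stop) {
            pthread_mutex_unlock(&lock);
            break;
        }
        needs_gc = false;       /* consume the wakeup, like clearing invalidate_needs_gc */
        pthread_mutex_unlock(&lock);

        run_gc(++pass);         /* do the actual work outside the lock */
    }
    return NULL;
}

int main(void)
{
    pthread_t tid;
    int i;

    pthread_create(&tid, NULL, gc_thread, NULL);

    for (i = 0; i < 3; i++) {   /* model wake_up_gc() being called */
        pthread_mutex_lock(&lock);
        needs_gc = true;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
        usleep(100000);
    }

    pthread_mutex_lock(&lock);  /* model kthread_stop() */
    stop = true;
    pthread_cond_signal(&cond);
    pthread_mutex_unlock(&lock);

    pthread_join(tid, NULL);
    return 0;
}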
@@ -1541,9 +1677,9 @@ void bch_queue_gc(struct cache_set *c)
1541static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, 1677static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1542 unsigned long **seen) 1678 unsigned long **seen)
1543{ 1679{
1544 int ret; 1680 int ret = 0;
1545 unsigned i; 1681 unsigned i;
1546 struct bkey *k; 1682 struct bkey *k, *p = NULL;
1547 struct bucket *g; 1683 struct bucket *g;
1548 struct btree_iter iter; 1684 struct btree_iter iter;
1549 1685
@@ -1570,31 +1706,32 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1570 } 1706 }
1571 1707
1572 if (b->level) { 1708 if (b->level) {
1573 k = bch_next_recurse_key(b, &ZERO_KEY); 1709 bch_btree_iter_init(b, &iter, NULL);
1574 1710
1575 while (k) { 1711 do {
1576 struct bkey *p = bch_next_recurse_key(b, k); 1712 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
1577 if (p) 1713 if (k)
1578 btree_node_prefetch(b->c, p, b->level - 1); 1714 btree_node_prefetch(b->c, k, b->level - 1);
1579 1715
1580 ret = btree(check_recurse, k, b, op, seen); 1716 if (p)
1581 if (ret) 1717 ret = btree(check_recurse, p, b, op, seen);
1582 return ret;
1583 1718
1584 k = p; 1719 p = k;
1585 } 1720 } while (p && !ret);
1586 } 1721 }
1587 1722
1588 return 0; 1723 return 0;
1589} 1724}
1590 1725
1591int bch_btree_check(struct cache_set *c, struct btree_op *op) 1726int bch_btree_check(struct cache_set *c)
1592{ 1727{
1593 int ret = -ENOMEM; 1728 int ret = -ENOMEM;
1594 unsigned i; 1729 unsigned i;
1595 unsigned long *seen[MAX_CACHES_PER_SET]; 1730 unsigned long *seen[MAX_CACHES_PER_SET];
1731 struct btree_op op;
1596 1732
1597 memset(seen, 0, sizeof(seen)); 1733 memset(seen, 0, sizeof(seen));
1734 bch_btree_op_init(&op, SHRT_MAX);
1598 1735
1599 for (i = 0; c->cache[i]; i++) { 1736 for (i = 0; c->cache[i]; i++) {
1600 size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); 1737 size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8);
@@ -1606,7 +1743,7 @@ int bch_btree_check(struct cache_set *c, struct btree_op *op)
1606 memset(seen[i], 0xFF, n); 1743 memset(seen[i], 0xFF, n);
1607 } 1744 }
1608 1745
1609 ret = btree_root(check_recurse, c, op, seen); 1746 ret = btree_root(check_recurse, c, &op, seen);
1610err: 1747err:
1611 for (i = 0; i < MAX_CACHES_PER_SET; i++) 1748 for (i = 0; i < MAX_CACHES_PER_SET; i++)
1612 kfree(seen[i]); 1749 kfree(seen[i]);
@@ -1628,10 +1765,9 @@ static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert)
1628 bch_bset_fix_lookup_table(b, where); 1765 bch_bset_fix_lookup_table(b, where);
1629} 1766}
1630 1767
1631static bool fix_overlapping_extents(struct btree *b, 1768static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
1632 struct bkey *insert,
1633 struct btree_iter *iter, 1769 struct btree_iter *iter,
1634 struct btree_op *op) 1770 struct bkey *replace_key)
1635{ 1771{
1636 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) 1772 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
1637 { 1773 {
@@ -1659,39 +1795,38 @@ static bool fix_overlapping_extents(struct btree *b,
1659 * We might overlap with 0 size extents; we can't skip these 1795 * We might overlap with 0 size extents; we can't skip these
1660 * because if they're in the set we're inserting to we have to 1796 * because if they're in the set we're inserting to we have to
1661 * adjust them so they don't overlap with the key we're 1797 * adjust them so they don't overlap with the key we're
1662 * inserting. But we don't want to check them for BTREE_REPLACE 1798 * inserting. But we don't want to check them for replace
1663 * operations. 1799 * operations.
1664 */ 1800 */
1665 1801
1666 if (op->type == BTREE_REPLACE && 1802 if (replace_key && KEY_SIZE(k)) {
1667 KEY_SIZE(k)) {
1668 /* 1803 /*
1669 * k might have been split since we inserted/found the 1804 * k might have been split since we inserted/found the
1670 * key we're replacing 1805 * key we're replacing
1671 */ 1806 */
1672 unsigned i; 1807 unsigned i;
1673 uint64_t offset = KEY_START(k) - 1808 uint64_t offset = KEY_START(k) -
1674 KEY_START(&op->replace); 1809 KEY_START(replace_key);
1675 1810
1676 /* But it must be a subset of the replace key */ 1811 /* But it must be a subset of the replace key */
1677 if (KEY_START(k) < KEY_START(&op->replace) || 1812 if (KEY_START(k) < KEY_START(replace_key) ||
1678 KEY_OFFSET(k) > KEY_OFFSET(&op->replace)) 1813 KEY_OFFSET(k) > KEY_OFFSET(replace_key))
1679 goto check_failed; 1814 goto check_failed;
1680 1815
1681 /* We didn't find a key that we were supposed to */ 1816 /* We didn't find a key that we were supposed to */
1682 if (KEY_START(k) > KEY_START(insert) + sectors_found) 1817 if (KEY_START(k) > KEY_START(insert) + sectors_found)
1683 goto check_failed; 1818 goto check_failed;
1684 1819
1685 if (KEY_PTRS(&op->replace) != KEY_PTRS(k)) 1820 if (KEY_PTRS(replace_key) != KEY_PTRS(k))
1686 goto check_failed; 1821 goto check_failed;
1687 1822
1688 /* skip past gen */ 1823 /* skip past gen */
1689 offset <<= 8; 1824 offset <<= 8;
1690 1825
1691 BUG_ON(!KEY_PTRS(&op->replace)); 1826 BUG_ON(!KEY_PTRS(replace_key));
1692 1827
1693 for (i = 0; i < KEY_PTRS(&op->replace); i++) 1828 for (i = 0; i < KEY_PTRS(replace_key); i++)
1694 if (k->ptr[i] != op->replace.ptr[i] + offset) 1829 if (k->ptr[i] != replace_key->ptr[i] + offset)
1695 goto check_failed; 1830 goto check_failed;
1696 1831
1697 sectors_found = KEY_OFFSET(k) - KEY_START(insert); 1832 sectors_found = KEY_OFFSET(k) - KEY_START(insert);
@@ -1742,6 +1877,9 @@ static bool fix_overlapping_extents(struct btree *b,
1742 if (bkey_cmp(insert, k) < 0) { 1877 if (bkey_cmp(insert, k) < 0) {
1743 bch_cut_front(insert, k); 1878 bch_cut_front(insert, k);
1744 } else { 1879 } else {
1880 if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
1881 old_offset = KEY_START(insert);
1882
1745 if (bkey_written(b, k) && 1883 if (bkey_written(b, k) &&
1746 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { 1884 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
1747 /* 1885 /*
@@ -1759,9 +1897,8 @@ static bool fix_overlapping_extents(struct btree *b,
1759 } 1897 }
1760 1898
1761check_failed: 1899check_failed:
1762 if (op->type == BTREE_REPLACE) { 1900 if (replace_key) {
1763 if (!sectors_found) { 1901 if (!sectors_found) {
1764 op->insert_collision = true;
1765 return true; 1902 return true;
1766 } else if (sectors_found < KEY_SIZE(insert)) { 1903 } else if (sectors_found < KEY_SIZE(insert)) {
1767 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - 1904 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
@@ -1774,7 +1911,7 @@ check_failed:
1774} 1911}
1775 1912
1776static bool btree_insert_key(struct btree *b, struct btree_op *op, 1913static bool btree_insert_key(struct btree *b, struct btree_op *op,
1777 struct bkey *k) 1914 struct bkey *k, struct bkey *replace_key)
1778{ 1915{
1779 struct bset *i = b->sets[b->nsets].data; 1916 struct bset *i = b->sets[b->nsets].data;
1780 struct bkey *m, *prev; 1917 struct bkey *m, *prev;
@@ -1786,22 +1923,23 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
1786 1923
1787 if (!b->level) { 1924 if (!b->level) {
1788 struct btree_iter iter; 1925 struct btree_iter iter;
1789 struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0);
1790 1926
1791 /* 1927 /*
1792 * bset_search() returns the first key that is strictly greater 1928 * bset_search() returns the first key that is strictly greater
1793 * than the search key - but for back merging, we want to find 1929 * than the search key - but for back merging, we want to find
1794 * the first key that is greater than or equal to KEY_START(k) - 1930 * the previous key.
1795 * unless KEY_START(k) is 0.
1796 */ 1931 */
1797 if (KEY_OFFSET(&search))
1798 SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1);
1799
1800 prev = NULL; 1932 prev = NULL;
1801 m = bch_btree_iter_init(b, &iter, &search); 1933 m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k)));
1802 1934
1803 if (fix_overlapping_extents(b, k, &iter, op)) 1935 if (fix_overlapping_extents(b, k, &iter, replace_key)) {
1936 op->insert_collision = true;
1804 return false; 1937 return false;
1938 }
1939
1940 if (KEY_DIRTY(k))
1941 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1942 KEY_START(k), KEY_SIZE(k));
1805 1943
1806 while (m != end(i) && 1944 while (m != end(i) &&
1807 bkey_cmp(k, &START_KEY(m)) > 0) 1945 bkey_cmp(k, &START_KEY(m)) > 0)
@@ -1825,84 +1963,80 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
1825 if (m != end(i) && 1963 if (m != end(i) &&
1826 bch_bkey_try_merge(b, k, m)) 1964 bch_bkey_try_merge(b, k, m))
1827 goto copy; 1965 goto copy;
1828 } else 1966 } else {
1967 BUG_ON(replace_key);
1829 m = bch_bset_search(b, &b->sets[b->nsets], k); 1968 m = bch_bset_search(b, &b->sets[b->nsets], k);
1969 }
1830 1970
1831insert: shift_keys(b, m, k); 1971insert: shift_keys(b, m, k);
1832copy: bkey_copy(m, k); 1972copy: bkey_copy(m, k);
1833merged: 1973merged:
1834 if (KEY_DIRTY(k)) 1974 bch_check_keys(b, "%u for %s", status,
1835 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), 1975 replace_key ? "replace" : "insert");
1836 KEY_START(k), KEY_SIZE(k));
1837
1838 bch_check_keys(b, "%u for %s", status, op_type(op));
1839 1976
1840 if (b->level && !KEY_OFFSET(k)) 1977 if (b->level && !KEY_OFFSET(k))
1841 btree_current_write(b)->prio_blocked++; 1978 btree_current_write(b)->prio_blocked++;
1842 1979
1843 trace_bcache_btree_insert_key(b, k, op->type, status); 1980 trace_bcache_btree_insert_key(b, k, replace_key != NULL, status);
1844 1981
1845 return true; 1982 return true;
1846} 1983}
1847 1984
1848static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) 1985static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
1986 struct keylist *insert_keys,
1987 struct bkey *replace_key)
1849{ 1988{
1850 bool ret = false; 1989 bool ret = false;
1851 struct bkey *k; 1990 int oldsize = bch_count_data(b);
1852 unsigned oldsize = bch_count_data(b);
1853
1854 while ((k = bch_keylist_pop(&op->keys))) {
1855 bkey_put(b->c, k, b->level);
1856 ret |= btree_insert_key(b, op, k);
1857 }
1858
1859 BUG_ON(bch_count_data(b) < oldsize);
1860 return ret;
1861}
1862 1991
1863bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, 1992 while (!bch_keylist_empty(insert_keys)) {
1864 struct bio *bio) 1993 struct bset *i = write_block(b);
1865{ 1994 struct bkey *k = insert_keys->keys;
1866 bool ret = false;
1867 uint64_t btree_ptr = b->key.ptr[0];
1868 unsigned long seq = b->seq;
1869 BKEY_PADDED(k) tmp;
1870 1995
1871 rw_unlock(false, b); 1996 if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c)
1872 rw_lock(true, b, b->level); 1997 > btree_blocks(b))
1998 break;
1873 1999
1874 if (b->key.ptr[0] != btree_ptr || 2000 if (bkey_cmp(k, &b->key) <= 0) {
1875 b->seq != seq + 1 || 2001 if (!b->level)
1876 should_split(b)) 2002 bkey_put(b->c, k);
1877 goto out;
1878 2003
1879 op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio)); 2004 ret |= btree_insert_key(b, op, k, replace_key);
2005 bch_keylist_pop_front(insert_keys);
2006 } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
2007 BKEY_PADDED(key) temp;
2008 bkey_copy(&temp.key, insert_keys->keys);
1880 2009
1881 SET_KEY_PTRS(&op->replace, 1); 2010 bch_cut_back(&b->key, &temp.key);
1882 get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); 2011 bch_cut_front(&b->key, insert_keys->keys);
1883 2012
1884 SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV); 2013 ret |= btree_insert_key(b, op, &temp.key, replace_key);
2014 break;
2015 } else {
2016 break;
2017 }
2018 }
1885 2019
1886 bkey_copy(&tmp.k, &op->replace); 2020 BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
1887 2021
1888 BUG_ON(op->type != BTREE_INSERT); 2022 BUG_ON(bch_count_data(b) < oldsize);
1889 BUG_ON(!btree_insert_key(b, op, &tmp.k));
1890 ret = true;
1891out:
1892 downgrade_write(&b->lock);
1893 return ret; 2023 return ret;
1894} 2024}
1895 2025
1896static int btree_split(struct btree *b, struct btree_op *op) 2026static int btree_split(struct btree *b, struct btree_op *op,
2027 struct keylist *insert_keys,
2028 struct bkey *replace_key)
1897{ 2029{
1898 bool split, root = b == b->c->root; 2030 bool split;
1899 struct btree *n1, *n2 = NULL, *n3 = NULL; 2031 struct btree *n1, *n2 = NULL, *n3 = NULL;
1900 uint64_t start_time = local_clock(); 2032 uint64_t start_time = local_clock();
2033 struct closure cl;
2034 struct keylist parent_keys;
1901 2035
1902 if (b->level) 2036 closure_init_stack(&cl);
1903 set_closure_blocking(&op->cl); 2037 bch_keylist_init(&parent_keys);
1904 2038
1905 n1 = btree_node_alloc_replacement(b, &op->cl); 2039 n1 = btree_node_alloc_replacement(b, true);
1906 if (IS_ERR(n1)) 2040 if (IS_ERR(n1))
1907 goto err; 2041 goto err;
1908 2042
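
The new bch_btree_insert_keys() above handles a key that straddles the end of a node's range by inserting a copy cut back to the node boundary and cutting the front off the original, so the remainder stays on the keylist for the next leaf. The same cut-back/cut-front idea on plain integer extents, as a hedged illustration (an extent here is just a half-open interval, not a real bkey):

#include <stdio.h>

/* toy extent: half-open interval [start, end) */
struct extent {
    unsigned long start, end;
};

/* keep only the part of e at or before the boundary (the bch_cut_back idea) */
static void cut_back(unsigned long boundary, struct extent *e)
{
    if (e->end > boundary)
        e->end = boundary;
}

/* keep only the part of e after the boundary (the bch_cut_front idea) */
static void cut_front(unsigned long boundary, struct extent *e)
{
    if (e->start < boundary)
        e->start = boundary;
}

int main(void)
{
    unsigned long node_end = 100;       /* stand-in for b->key */
    struct extent k = { 80, 130 };      /* straddles the node boundary */
    struct extent here = k;

    cut_back(node_end, &here);  /* inserted into this node */
    cut_front(node_end, &k);    /* carried over to the next node */

    printf("insert [%lu,%lu) here, carry [%lu,%lu) to the next node\n",
           here.start, here.end, k.start, k.end);
    return 0;
}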
@@ -1913,19 +2047,20 @@ static int btree_split(struct btree *b, struct btree_op *op)
1913 2047
1914 trace_bcache_btree_node_split(b, n1->sets[0].data->keys); 2048 trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
1915 2049
1916 n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); 2050 n2 = bch_btree_node_alloc(b->c, b->level, true);
1917 if (IS_ERR(n2)) 2051 if (IS_ERR(n2))
1918 goto err_free1; 2052 goto err_free1;
1919 2053
1920 if (root) { 2054 if (!b->parent) {
1921 n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl); 2055 n3 = bch_btree_node_alloc(b->c, b->level + 1, true);
1922 if (IS_ERR(n3)) 2056 if (IS_ERR(n3))
1923 goto err_free2; 2057 goto err_free2;
1924 } 2058 }
1925 2059
1926 bch_btree_insert_keys(n1, op); 2060 bch_btree_insert_keys(n1, op, insert_keys, replace_key);
1927 2061
1928 /* Has to be a linear search because we don't have an auxiliary 2062 /*
2063 * Has to be a linear search because we don't have an auxiliary
1929 * search tree yet 2064 * search tree yet
1930 */ 2065 */
1931 2066
@@ -1944,60 +2079,57 @@ static int btree_split(struct btree *b, struct btree_op *op)
1944 2079
1945 bkey_copy_key(&n2->key, &b->key); 2080 bkey_copy_key(&n2->key, &b->key);
1946 2081
1947 bch_keylist_add(&op->keys, &n2->key); 2082 bch_keylist_add(&parent_keys, &n2->key);
1948 bch_btree_node_write(n2, &op->cl); 2083 bch_btree_node_write(n2, &cl);
1949 rw_unlock(true, n2); 2084 rw_unlock(true, n2);
1950 } else { 2085 } else {
1951 trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); 2086 trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
1952 2087
1953 bch_btree_insert_keys(n1, op); 2088 bch_btree_insert_keys(n1, op, insert_keys, replace_key);
1954 } 2089 }
1955 2090
1956 bch_keylist_add(&op->keys, &n1->key); 2091 bch_keylist_add(&parent_keys, &n1->key);
1957 bch_btree_node_write(n1, &op->cl); 2092 bch_btree_node_write(n1, &cl);
1958 2093
1959 if (n3) { 2094 if (n3) {
2095 /* Depth increases, make a new root */
1960 bkey_copy_key(&n3->key, &MAX_KEY); 2096 bkey_copy_key(&n3->key, &MAX_KEY);
1961 bch_btree_insert_keys(n3, op); 2097 bch_btree_insert_keys(n3, op, &parent_keys, NULL);
1962 bch_btree_node_write(n3, &op->cl); 2098 bch_btree_node_write(n3, &cl);
1963 2099
1964 closure_sync(&op->cl); 2100 closure_sync(&cl);
1965 bch_btree_set_root(n3); 2101 bch_btree_set_root(n3);
1966 rw_unlock(true, n3); 2102 rw_unlock(true, n3);
1967 } else if (root) {
1968 op->keys.top = op->keys.bottom;
1969 closure_sync(&op->cl);
1970 bch_btree_set_root(n1);
1971 } else {
1972 unsigned i;
1973 2103
1974 bkey_copy(op->keys.top, &b->key); 2104 btree_node_free(b);
1975 bkey_copy_key(op->keys.top, &ZERO_KEY); 2105 } else if (!b->parent) {
2106 /* Root filled up but didn't need to be split */
2107 closure_sync(&cl);
2108 bch_btree_set_root(n1);
1976 2109
1977 for (i = 0; i < KEY_PTRS(&b->key); i++) { 2110 btree_node_free(b);
1978 uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1; 2111 } else {
2112 /* Split a non root node */
2113 closure_sync(&cl);
2114 make_btree_freeing_key(b, parent_keys.top);
2115 bch_keylist_push(&parent_keys);
1979 2116
1980 SET_PTR_GEN(op->keys.top, i, g); 2117 btree_node_free(b);
1981 }
1982 2118
1983 bch_keylist_push(&op->keys); 2119 bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
1984 closure_sync(&op->cl); 2120 BUG_ON(!bch_keylist_empty(&parent_keys));
1985 atomic_inc(&b->c->prio_blocked);
1986 } 2121 }
1987 2122
1988 rw_unlock(true, n1); 2123 rw_unlock(true, n1);
1989 btree_node_free(b, op);
1990 2124
1991 bch_time_stats_update(&b->c->btree_split_time, start_time); 2125 bch_time_stats_update(&b->c->btree_split_time, start_time);
1992 2126
1993 return 0; 2127 return 0;
1994err_free2: 2128err_free2:
1995 __bkey_put(n2->c, &n2->key); 2129 btree_node_free(n2);
1996 btree_node_free(n2, op);
1997 rw_unlock(true, n2); 2130 rw_unlock(true, n2);
1998err_free1: 2131err_free1:
1999 __bkey_put(n1->c, &n1->key); 2132 btree_node_free(n1);
2000 btree_node_free(n1, op);
2001 rw_unlock(true, n1); 2133 rw_unlock(true, n1);
2002err: 2134err:
2003 if (n3 == ERR_PTR(-EAGAIN) || 2135 if (n3 == ERR_PTR(-EAGAIN) ||
@@ -2009,116 +2141,126 @@ err:
2009 return -ENOMEM; 2141 return -ENOMEM;
2010} 2142}
2011 2143
2012static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, 2144static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
2013 struct keylist *stack_keys) 2145 struct keylist *insert_keys,
2146 atomic_t *journal_ref,
2147 struct bkey *replace_key)
2014{ 2148{
2015 if (b->level) { 2149 BUG_ON(b->level && replace_key);
2016 int ret;
2017 struct bkey *insert = op->keys.bottom;
2018 struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert));
2019
2020 if (!k) {
2021 btree_bug(b, "no key to recurse on at level %i/%i",
2022 b->level, b->c->root->level);
2023 2150
2024 op->keys.top = op->keys.bottom; 2151 if (should_split(b)) {
2025 return -EIO; 2152 if (current->bio_list) {
2153 op->lock = b->c->root->level + 1;
2154 return -EAGAIN;
2155 } else if (op->lock <= b->c->root->level) {
2156 op->lock = b->c->root->level + 1;
2157 return -EINTR;
2158 } else {
2159 /* Invalidated all iterators */
2160 return btree_split(b, op, insert_keys, replace_key) ?:
2161 -EINTR;
2026 } 2162 }
2163 } else {
2164 BUG_ON(write_block(b) != b->sets[b->nsets].data);
2027 2165
2028 if (bkey_cmp(insert, k) > 0) { 2166 if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
2029 unsigned i; 2167 if (!b->level)
2030 2168 bch_btree_leaf_dirty(b, journal_ref);
2031 if (op->type == BTREE_REPLACE) { 2169 else
2032 __bkey_put(b->c, insert); 2170 bch_btree_node_write_sync(b);
2033 op->keys.top = op->keys.bottom; 2171 }
2034 op->insert_collision = true;
2035 return 0;
2036 }
2037 2172
2038 for (i = 0; i < KEY_PTRS(insert); i++) 2173 return 0;
2039 atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin); 2174 }
2175}
2040 2176
2041 bkey_copy(stack_keys->top, insert); 2177int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
2178 struct bkey *check_key)
2179{
2180 int ret = -EINTR;
2181 uint64_t btree_ptr = b->key.ptr[0];
2182 unsigned long seq = b->seq;
2183 struct keylist insert;
2184 bool upgrade = op->lock == -1;
2042 2185
2043 bch_cut_back(k, insert); 2186 bch_keylist_init(&insert);
2044 bch_cut_front(k, stack_keys->top);
2045 2187
2046 bch_keylist_push(stack_keys); 2188 if (upgrade) {
2047 } 2189 rw_unlock(false, b);
2190 rw_lock(true, b, b->level);
2048 2191
2049 ret = btree(insert_recurse, k, b, op, stack_keys); 2192 if (b->key.ptr[0] != btree_ptr ||
2050 if (ret) 2193 b->seq != seq + 1)
2051 return ret; 2194 goto out;
2052 } 2195 }
2053 2196
2054 if (!bch_keylist_empty(&op->keys)) { 2197 SET_KEY_PTRS(check_key, 1);
2055 if (should_split(b)) { 2198 get_random_bytes(&check_key->ptr[0], sizeof(uint64_t));
2056 if (op->lock <= b->c->root->level) {
2057 BUG_ON(b->level);
2058 op->lock = b->c->root->level + 1;
2059 return -EINTR;
2060 }
2061 return btree_split(b, op);
2062 }
2063 2199
2064 BUG_ON(write_block(b) != b->sets[b->nsets].data); 2200 SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV);
2065 2201
2066 if (bch_btree_insert_keys(b, op)) { 2202 bch_keylist_add(&insert, check_key);
2067 if (!b->level)
2068 bch_btree_leaf_dirty(b, op);
2069 else
2070 bch_btree_node_write(b, &op->cl);
2071 }
2072 }
2073 2203
2074 return 0; 2204 ret = bch_btree_insert_node(b, op, &insert, NULL, NULL);
2205
2206 BUG_ON(!ret && !bch_keylist_empty(&insert));
2207out:
2208 if (upgrade)
2209 downgrade_write(&b->lock);
2210 return ret;
2075} 2211}
2076 2212
2077int bch_btree_insert(struct btree_op *op, struct cache_set *c) 2213struct btree_insert_op {
2214 struct btree_op op;
2215 struct keylist *keys;
2216 atomic_t *journal_ref;
2217 struct bkey *replace_key;
2218};
2219
2220int btree_insert_fn(struct btree_op *b_op, struct btree *b)
2078{ 2221{
2079 int ret = 0; 2222 struct btree_insert_op *op = container_of(b_op,
2080 struct keylist stack_keys; 2223 struct btree_insert_op, op);
2081 2224
2082 /* 2225 int ret = bch_btree_insert_node(b, &op->op, op->keys,
2083 * Don't want to block with the btree locked unless we have to, 2226 op->journal_ref, op->replace_key);
2084 * otherwise we get deadlocks with try_harder and between split/gc 2227 if (ret && !bch_keylist_empty(op->keys))
2085 */ 2228 return ret;
2086 clear_closure_blocking(&op->cl); 2229 else
2087 2230 return MAP_DONE;
2088 BUG_ON(bch_keylist_empty(&op->keys)); 2231}
2089 bch_keylist_copy(&stack_keys, &op->keys);
2090 bch_keylist_init(&op->keys);
2091
2092 while (!bch_keylist_empty(&stack_keys) ||
2093 !bch_keylist_empty(&op->keys)) {
2094 if (bch_keylist_empty(&op->keys)) {
2095 bch_keylist_add(&op->keys,
2096 bch_keylist_pop(&stack_keys));
2097 op->lock = 0;
2098 }
2099 2232
2100 ret = btree_root(insert_recurse, c, op, &stack_keys); 2233int bch_btree_insert(struct cache_set *c, struct keylist *keys,
2234 atomic_t *journal_ref, struct bkey *replace_key)
2235{
2236 struct btree_insert_op op;
2237 int ret = 0;
2101 2238
2102 if (ret == -EAGAIN) { 2239 BUG_ON(current->bio_list);
2103 ret = 0; 2240 BUG_ON(bch_keylist_empty(keys));
2104 closure_sync(&op->cl); 2241
2105 } else if (ret) { 2242 bch_btree_op_init(&op.op, 0);
2106 struct bkey *k; 2243 op.keys = keys;
2244 op.journal_ref = journal_ref;
2245 op.replace_key = replace_key;
2246
2247 while (!ret && !bch_keylist_empty(keys)) {
2248 op.op.lock = 0;
2249 ret = bch_btree_map_leaf_nodes(&op.op, c,
2250 &START_KEY(keys->keys),
2251 btree_insert_fn);
2252 }
2107 2253
2108 pr_err("error %i trying to insert key for %s", 2254 if (ret) {
2109 ret, op_type(op)); 2255 struct bkey *k;
2110 2256
2111 while ((k = bch_keylist_pop(&stack_keys) ?: 2257 pr_err("error %i", ret);
2112 bch_keylist_pop(&op->keys)))
2113 bkey_put(c, k, 0);
2114 }
2115 }
2116 2258
2117 bch_keylist_free(&stack_keys); 2259 while ((k = bch_keylist_pop(keys)))
2260 bkey_put(c, k);
2261 } else if (op.op.insert_collision)
2262 ret = -ESRCH;
2118 2263
2119 if (op->journal)
2120 atomic_dec_bug(op->journal);
2121 op->journal = NULL;
2122 return ret; 2264 return ret;
2123} 2265}
2124 2266
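
bch_btree_insert() now drives the generic leaf-node mapping code by embedding a struct btree_op at the start of a private struct btree_insert_op and recovering the outer struct with container_of() inside the callback. A standalone sketch of that embedding pattern is below; the struct and field names are illustrative, not the bcache ones.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

/* the generic context every callback receives */
struct base_op {
    int lock;
};

/* a caller-private wrapper carrying extra arguments through the callback */
struct insert_op {
    struct base_op op;      /* embedded so container_of() can recover the wrapper */
    const char *payload;
};

static int insert_fn(struct base_op *b_op)
{
    struct insert_op *op = container_of(b_op, struct insert_op, op);

    printf("callback sees payload: %s\n", op->payload);
    return 0;
}

/* stand-in for the code that walks the tree and invokes the callback */
static int map_leaf_nodes(struct base_op *op, int (*fn)(struct base_op *))
{
    return fn(op);
}

int main(void)
{
    struct insert_op op = { .op = { .lock = 0 }, .payload = "keys to insert" };

    return map_leaf_nodes(&op.op, insert_fn);
}

The payoff of the pattern is that the mapping code only ever sees the small generic op, while each caller can smuggle whatever extra state its callback needs.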
@@ -2141,132 +2283,81 @@ void bch_btree_set_root(struct btree *b)
2141 mutex_unlock(&b->c->bucket_lock); 2283 mutex_unlock(&b->c->bucket_lock);
2142 2284
2143 b->c->root = b; 2285 b->c->root = b;
2144 __bkey_put(b->c, &b->key);
2145 2286
2146 bch_journal_meta(b->c, &cl); 2287 bch_journal_meta(b->c, &cl);
2147 closure_sync(&cl); 2288 closure_sync(&cl);
2148} 2289}
2149 2290
2150/* Cache lookup */ 2291/* Map across nodes or keys */
2151 2292
2152static int submit_partial_cache_miss(struct btree *b, struct btree_op *op, 2293static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
2153 struct bkey *k) 2294 struct bkey *from,
2295 btree_map_nodes_fn *fn, int flags)
2154{ 2296{
2155 struct search *s = container_of(op, struct search, op); 2297 int ret = MAP_CONTINUE;
2156 struct bio *bio = &s->bio.bio; 2298
2157 int ret = 0; 2299 if (b->level) {
2300 struct bkey *k;
2301 struct btree_iter iter;
2158 2302
2159 while (!ret && 2303 bch_btree_iter_init(b, &iter, from);
2160 !op->lookup_done) {
2161 unsigned sectors = INT_MAX;
2162 2304
2163 if (KEY_INODE(k) == op->inode) { 2305 while ((k = bch_btree_iter_next_filter(&iter, b,
2164 if (KEY_START(k) <= bio->bi_sector) 2306 bch_ptr_bad))) {
2165 break; 2307 ret = btree(map_nodes_recurse, k, b,
2308 op, from, fn, flags);
2309 from = NULL;
2166 2310
2167 sectors = min_t(uint64_t, sectors, 2311 if (ret != MAP_CONTINUE)
2168 KEY_START(k) - bio->bi_sector); 2312 return ret;
2169 } 2313 }
2170
2171 ret = s->d->cache_miss(b, s, bio, sectors);
2172 } 2314 }
2173 2315
2316 if (!b->level || flags == MAP_ALL_NODES)
2317 ret = fn(op, b);
2318
2174 return ret; 2319 return ret;
2175} 2320}
2176 2321
2177/* 2322int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
2178 * Read from a single key, handling the initial cache miss if the key starts in 2323 struct bkey *from, btree_map_nodes_fn *fn, int flags)
2179 * the middle of the bio
2180 */
2181static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
2182 struct bkey *k)
2183{ 2324{
2184 struct search *s = container_of(op, struct search, op); 2325 return btree_root(map_nodes_recurse, c, op, from, fn, flags);
2185 struct bio *bio = &s->bio.bio;
2186 unsigned ptr;
2187 struct bio *n;
2188
2189 int ret = submit_partial_cache_miss(b, op, k);
2190 if (ret || op->lookup_done)
2191 return ret;
2192
2193 /* XXX: figure out best pointer - for multiple cache devices */
2194 ptr = 0;
2195
2196 PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
2197
2198 while (!op->lookup_done &&
2199 KEY_INODE(k) == op->inode &&
2200 bio->bi_sector < KEY_OFFSET(k)) {
2201 struct bkey *bio_key;
2202 sector_t sector = PTR_OFFSET(k, ptr) +
2203 (bio->bi_sector - KEY_START(k));
2204 unsigned sectors = min_t(uint64_t, INT_MAX,
2205 KEY_OFFSET(k) - bio->bi_sector);
2206
2207 n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
2208 if (n == bio)
2209 op->lookup_done = true;
2210
2211 bio_key = &container_of(n, struct bbio, bio)->key;
2212
2213 /*
2214 * The bucket we're reading from might be reused while our bio
2215 * is in flight, and we could then end up reading the wrong
2216 * data.
2217 *
2218 * We guard against this by checking (in cache_read_endio()) if
2219 * the pointer is stale again; if so, we treat it as an error
2220 * and reread from the backing device (but we don't pass that
2221 * error up anywhere).
2222 */
2223
2224 bch_bkey_copy_single_ptr(bio_key, k, ptr);
2225 SET_PTR_OFFSET(bio_key, 0, sector);
2226
2227 n->bi_end_io = bch_cache_read_endio;
2228 n->bi_private = &s->cl;
2229
2230 __bch_submit_bbio(n, b->c);
2231 }
2232
2233 return 0;
2234} 2326}
2235 2327
2236int bch_btree_search_recurse(struct btree *b, struct btree_op *op) 2328static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
2329 struct bkey *from, btree_map_keys_fn *fn,
2330 int flags)
2237{ 2331{
2238 struct search *s = container_of(op, struct search, op); 2332 int ret = MAP_CONTINUE;
2239 struct bio *bio = &s->bio.bio;
2240
2241 int ret = 0;
2242 struct bkey *k; 2333 struct bkey *k;
2243 struct btree_iter iter; 2334 struct btree_iter iter;
2244 bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0));
2245 2335
2246 do { 2336 bch_btree_iter_init(b, &iter, from);
2247 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
2248 if (!k) {
2249 /*
2250 * b->key would be exactly what we want, except that
2251 * pointers to btree nodes have nonzero size - we
2252 * wouldn't go far enough
2253 */
2254 2337
2255 ret = submit_partial_cache_miss(b, op, 2338 while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) {
2256 &KEY(KEY_INODE(&b->key), 2339 ret = !b->level
2257 KEY_OFFSET(&b->key), 0)); 2340 ? fn(op, b, k)
2258 break; 2341 : btree(map_keys_recurse, k, b, op, from, fn, flags);
2259 } 2342 from = NULL;
2343
2344 if (ret != MAP_CONTINUE)
2345 return ret;
2346 }
2260 2347
2261 ret = b->level 2348 if (!b->level && (flags & MAP_END_KEY))
2262 ? btree(search_recurse, k, b, op) 2349 ret = fn(op, b, &KEY(KEY_INODE(&b->key),
2263 : submit_partial_cache_hit(b, op, k); 2350 KEY_OFFSET(&b->key), 0));
2264 } while (!ret &&
2265 !op->lookup_done);
2266 2351
2267 return ret; 2352 return ret;
2268} 2353}
2269 2354
2355int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
2356 struct bkey *from, btree_map_keys_fn *fn, int flags)
2357{
2358 return btree_root(map_keys_recurse, c, op, from, fn, flags);
2359}
2360
2270/* Keybuf code */ 2361/* Keybuf code */
2271 2362
2272static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) 2363static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
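
The hand-rolled lookup and refill recursion is replaced here by bch_btree_map_nodes()/bch_btree_map_keys(), which call a per-node or per-key function and keep walking while it returns MAP_CONTINUE, stopping as soon as it returns MAP_DONE. A small standalone model of that return-code protocol over a plain array instead of a btree:

#include <stdio.h>

#define MAP_DONE     0
#define MAP_CONTINUE 1

/* callback type: same idea as btree_map_keys_fn, minus the btree arguments */
typedef int (map_fn)(void *ctx, int key);

/* walk keys in order, stopping when the callback says it is done */
static int map_keys(const int *keys, int nr, void *ctx, map_fn *fn)
{
    int i, ret = MAP_CONTINUE;

    for (i = 0; i < nr && ret == MAP_CONTINUE; i++)
        ret = fn(ctx, keys[i]);

    return ret;
}

/* collect keys until a limit is hit, like refill_keybuf_fn filling its freelist */
struct refill {
    int found;
    int limit;
};

static int refill_fn(void *ctx, int key)
{
    struct refill *r = ctx;

    printf("keeping key %d\n", key);
    if (++r->found >= r->limit)
        return MAP_DONE;        /* buffer full: stop the walk */

    return MAP_CONTINUE;
}

int main(void)
{
    int keys[] = { 3, 7, 11, 19, 23 };
    struct refill r = { .found = 0, .limit = 3 };

    map_keys(keys, 5, &r, refill_fn);
    printf("stopped after %d keys\n", r.found);
    return 0;
}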
@@ -2285,80 +2376,79 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
2285 return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); 2376 return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
2286} 2377}
2287 2378
2288static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, 2379struct refill {
2289 struct keybuf *buf, struct bkey *end, 2380 struct btree_op op;
2290 keybuf_pred_fn *pred) 2381 unsigned nr_found;
2291{ 2382 struct keybuf *buf;
2292 struct btree_iter iter; 2383 struct bkey *end;
2293 bch_btree_iter_init(b, &iter, &buf->last_scanned); 2384 keybuf_pred_fn *pred;
2294 2385};
2295 while (!array_freelist_empty(&buf->freelist)) {
2296 struct bkey *k = bch_btree_iter_next_filter(&iter, b,
2297 bch_ptr_bad);
2298
2299 if (!b->level) {
2300 if (!k) {
2301 buf->last_scanned = b->key;
2302 break;
2303 }
2304 2386
2305 buf->last_scanned = *k; 2387static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
2306 if (bkey_cmp(&buf->last_scanned, end) >= 0) 2388 struct bkey *k)
2307 break; 2389{
2390 struct refill *refill = container_of(op, struct refill, op);
2391 struct keybuf *buf = refill->buf;
2392 int ret = MAP_CONTINUE;
2308 2393
2309 if (pred(buf, k)) { 2394 if (bkey_cmp(k, refill->end) >= 0) {
2310 struct keybuf_key *w; 2395 ret = MAP_DONE;
2396 goto out;
2397 }
2311 2398
2312 spin_lock(&buf->lock); 2399 if (!KEY_SIZE(k)) /* end key */
2400 goto out;
2313 2401
2314 w = array_alloc(&buf->freelist); 2402 if (refill->pred(buf, k)) {
2403 struct keybuf_key *w;
2315 2404
2316 w->private = NULL; 2405 spin_lock(&buf->lock);
2317 bkey_copy(&w->key, k);
2318 2406
2319 if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) 2407 w = array_alloc(&buf->freelist);
2320 array_free(&buf->freelist, w); 2408 if (!w) {
2409 spin_unlock(&buf->lock);
2410 return MAP_DONE;
2411 }
2321 2412
2322 spin_unlock(&buf->lock); 2413 w->private = NULL;
2323 } 2414 bkey_copy(&w->key, k);
2324 } else {
2325 if (!k)
2326 break;
2327 2415
2328 btree(refill_keybuf, k, b, op, buf, end, pred); 2416 if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
2329 /* 2417 array_free(&buf->freelist, w);
2330 * Might get an error here, but can't really do anything 2418 else
2331 * and it'll get logged elsewhere. Just read what we 2419 refill->nr_found++;
2332 * can.
2333 */
2334 2420
2335 if (bkey_cmp(&buf->last_scanned, end) >= 0) 2421 if (array_freelist_empty(&buf->freelist))
2336 break; 2422 ret = MAP_DONE;
2337 2423
2338 cond_resched(); 2424 spin_unlock(&buf->lock);
2339 }
2340 } 2425 }
2341 2426out:
2342 return 0; 2427 buf->last_scanned = *k;
2428 return ret;
2343} 2429}
2344 2430
2345void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, 2431void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
2346 struct bkey *end, keybuf_pred_fn *pred) 2432 struct bkey *end, keybuf_pred_fn *pred)
2347{ 2433{
2348 struct bkey start = buf->last_scanned; 2434 struct bkey start = buf->last_scanned;
2349 struct btree_op op; 2435 struct refill refill;
2350 bch_btree_op_init_stack(&op);
2351 2436
2352 cond_resched(); 2437 cond_resched();
2353 2438
2354 btree_root(refill_keybuf, c, &op, buf, end, pred); 2439 bch_btree_op_init(&refill.op, -1);
2355 closure_sync(&op.cl); 2440 refill.nr_found = 0;
2441 refill.buf = buf;
2442 refill.end = end;
2443 refill.pred = pred;
2444
2445 bch_btree_map_keys(&refill.op, c, &buf->last_scanned,
2446 refill_keybuf_fn, MAP_END_KEY);
2356 2447
2357 pr_debug("found %s keys from %llu:%llu to %llu:%llu", 2448 trace_bcache_keyscan(refill.nr_found,
2358 RB_EMPTY_ROOT(&buf->keys) ? "no" : 2449 KEY_INODE(&start), KEY_OFFSET(&start),
2359 array_freelist_empty(&buf->freelist) ? "some" : "a few", 2450 KEY_INODE(&buf->last_scanned),
2360 KEY_INODE(&start), KEY_OFFSET(&start), 2451 KEY_OFFSET(&buf->last_scanned));
2361 KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned));
2362 2452
2363 spin_lock(&buf->lock); 2453 spin_lock(&buf->lock);
2364 2454
@@ -2436,9 +2526,9 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
2436} 2526}
2437 2527
2438struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, 2528struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
2439 struct keybuf *buf, 2529 struct keybuf *buf,
2440 struct bkey *end, 2530 struct bkey *end,
2441 keybuf_pred_fn *pred) 2531 keybuf_pred_fn *pred)
2442{ 2532{
2443 struct keybuf_key *ret; 2533 struct keybuf_key *ret;
2444 2534
@@ -2471,14 +2561,12 @@ void bch_btree_exit(void)
2471{ 2561{
2472 if (btree_io_wq) 2562 if (btree_io_wq)
2473 destroy_workqueue(btree_io_wq); 2563 destroy_workqueue(btree_io_wq);
2474 if (bch_gc_wq)
2475 destroy_workqueue(bch_gc_wq);
2476} 2564}
2477 2565
2478int __init bch_btree_init(void) 2566int __init bch_btree_init(void)
2479{ 2567{
2480 if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) || 2568 btree_io_wq = create_singlethread_workqueue("bch_btree_io");
2481 !(btree_io_wq = create_singlethread_workqueue("bch_btree_io"))) 2569 if (!btree_io_wq)
2482 return -ENOMEM; 2570 return -ENOMEM;
2483 2571
2484 return 0; 2572 return 0;
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 3333d3723633..767e75570896 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -125,6 +125,7 @@ struct btree {
125 unsigned long seq; 125 unsigned long seq;
126 struct rw_semaphore lock; 126 struct rw_semaphore lock;
127 struct cache_set *c; 127 struct cache_set *c;
128 struct btree *parent;
128 129
129 unsigned long flags; 130 unsigned long flags;
130 uint16_t written; /* would be nice to kill */ 131 uint16_t written; /* would be nice to kill */
@@ -200,12 +201,7 @@ static inline bool bkey_written(struct btree *b, struct bkey *k)
200 201
201static inline void set_gc_sectors(struct cache_set *c) 202static inline void set_gc_sectors(struct cache_set *c)
202{ 203{
203 atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8); 204 atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
204}
205
206static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k)
207{
208 return __bch_ptr_invalid(b->c, b->level, k);
209} 205}
210 206
211static inline struct bkey *bch_btree_iter_init(struct btree *b, 207static inline struct bkey *bch_btree_iter_init(struct btree *b,
@@ -215,6 +211,16 @@ static inline struct bkey *bch_btree_iter_init(struct btree *b,
215 return __bch_btree_iter_init(b, iter, search, b->sets); 211 return __bch_btree_iter_init(b, iter, search, b->sets);
216} 212}
217 213
214static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k)
215{
216 if (b->level)
217 return bch_btree_ptr_invalid(b->c, k);
218 else
219 return bch_extent_ptr_invalid(b->c, k);
220}
221
222void bkey_put(struct cache_set *c, struct bkey *k);
223
218/* Looping macros */ 224/* Looping macros */
219 225
220#define for_each_cached_btree(b, c, iter) \ 226#define for_each_cached_btree(b, c, iter) \
@@ -234,51 +240,17 @@ static inline struct bkey *bch_btree_iter_init(struct btree *b,
234/* Recursing down the btree */ 240/* Recursing down the btree */
235 241
236struct btree_op { 242struct btree_op {
237 struct closure cl;
238 struct cache_set *c;
239
240 /* Journal entry we have a refcount on */
241 atomic_t *journal;
242
243 /* Bio to be inserted into the cache */
244 struct bio *cache_bio;
245
246 unsigned inode;
247
248 uint16_t write_prio;
249
250 /* Btree level at which we start taking write locks */ 243 /* Btree level at which we start taking write locks */
251 short lock; 244 short lock;
252 245
253 /* Btree insertion type */
254 enum {
255 BTREE_INSERT,
256 BTREE_REPLACE
257 } type:8;
258
259 unsigned csum:1;
260 unsigned skip:1;
261 unsigned flush_journal:1;
262
263 unsigned insert_data_done:1;
264 unsigned lookup_done:1;
265 unsigned insert_collision:1; 246 unsigned insert_collision:1;
266
267 /* Anything after this point won't get zeroed in do_bio_hook() */
268
269 /* Keys to be inserted */
270 struct keylist keys;
271 BKEY_PADDED(replace);
272}; 247};
273 248
274enum { 249static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
275 BTREE_INSERT_STATUS_INSERT, 250{
276 BTREE_INSERT_STATUS_BACK_MERGE, 251 memset(op, 0, sizeof(struct btree_op));
277 BTREE_INSERT_STATUS_OVERWROTE, 252 op->lock = write_lock_level;
278 BTREE_INSERT_STATUS_FRONT_MERGE, 253}
279};
280
281void bch_btree_op_init_stack(struct btree_op *);
282 254
283static inline void rw_lock(bool w, struct btree *b, int level) 255static inline void rw_lock(bool w, struct btree *b, int level)
284{ 256{
@@ -290,108 +262,71 @@ static inline void rw_lock(bool w, struct btree *b, int level)
290 262
291static inline void rw_unlock(bool w, struct btree *b) 263static inline void rw_unlock(bool w, struct btree *b)
292{ 264{
293#ifdef CONFIG_BCACHE_EDEBUG
294 unsigned i;
295
296 if (w && b->key.ptr[0])
297 for (i = 0; i <= b->nsets; i++)
298 bch_check_key_order(b, b->sets[i].data);
299#endif
300
301 if (w) 265 if (w)
302 b->seq++; 266 b->seq++;
303 (w ? up_write : up_read)(&b->lock); 267 (w ? up_write : up_read)(&b->lock);
304} 268}
305 269
306#define insert_lock(s, b) ((b)->level <= (s)->lock) 270void bch_btree_node_read(struct btree *);
271void bch_btree_node_write(struct btree *, struct closure *);
307 272
308/* 273void bch_btree_set_root(struct btree *);
309 * These macros are for recursing down the btree - they handle the details of 274struct btree *bch_btree_node_alloc(struct cache_set *, int, bool);
310 * locking and looking up nodes in the cache for you. They're best treated as 275struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool);
311 * mere syntax when reading code that uses them.
312 *
313 * op->lock determines whether we take a read or a write lock at a given depth.
314 * If you've got a read lock and find that you need a write lock (i.e. you're
315 * going to have to split), set op->lock and return -EINTR; btree_root() will
316 * call you again and you'll have the correct lock.
317 */
318 276
319/** 277int bch_btree_insert_check_key(struct btree *, struct btree_op *,
320 * btree - recurse down the btree on a specified key 278 struct bkey *);
321 * @fn: function to call, which will be passed the child node 279int bch_btree_insert(struct cache_set *, struct keylist *,
322 * @key: key to recurse on 280 atomic_t *, struct bkey *);
323 * @b: parent btree node 281
324 * @op: pointer to struct btree_op 282int bch_gc_thread_start(struct cache_set *);
325 */ 283size_t bch_btree_gc_finish(struct cache_set *);
326#define btree(fn, key, b, op, ...) \ 284void bch_moving_gc(struct cache_set *);
327({ \ 285int bch_btree_check(struct cache_set *);
328 int _r, l = (b)->level - 1; \ 286uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *);
329 bool _w = l <= (op)->lock; \
330 struct btree *_b = bch_btree_node_get((b)->c, key, l, op); \
331 if (!IS_ERR(_b)) { \
332 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
333 rw_unlock(_w, _b); \
334 } else \
335 _r = PTR_ERR(_b); \
336 _r; \
337})
338
339/**
340 * btree_root - call a function on the root of the btree
341 * @fn: function to call, which will be passed the child node
342 * @c: cache set
343 * @op: pointer to struct btree_op
344 */
345#define btree_root(fn, c, op, ...) \
346({ \
347 int _r = -EINTR; \
348 do { \
349 struct btree *_b = (c)->root; \
350 bool _w = insert_lock(op, _b); \
351 rw_lock(_w, _b, _b->level); \
352 if (_b == (c)->root && \
353 _w == insert_lock(op, _b)) \
354 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
355 rw_unlock(_w, _b); \
356 bch_cannibalize_unlock(c, &(op)->cl); \
357 } while (_r == -EINTR); \
358 \
359 _r; \
360})
361 287
362static inline bool should_split(struct btree *b) 288static inline void wake_up_gc(struct cache_set *c)
363{ 289{
364 struct bset *i = write_block(b); 290 if (c->gc_thread)
365 return b->written >= btree_blocks(b) || 291 wake_up_process(c->gc_thread);
366 (i->seq == b->sets[0].data->seq &&
367 b->written + __set_blocks(i, i->keys + 15, b->c)
368 > btree_blocks(b));
369} 292}
370 293
371void bch_btree_node_read(struct btree *); 294#define MAP_DONE 0
372void bch_btree_node_write(struct btree *, struct closure *); 295#define MAP_CONTINUE 1
373 296
374void bch_cannibalize_unlock(struct cache_set *, struct closure *); 297#define MAP_ALL_NODES 0
375void bch_btree_set_root(struct btree *); 298#define MAP_LEAF_NODES 1
376struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *);
377struct btree *bch_btree_node_get(struct cache_set *, struct bkey *,
378 int, struct btree_op *);
379 299
380bool bch_btree_insert_check_key(struct btree *, struct btree_op *, 300#define MAP_END_KEY 1
381 struct bio *);
382int bch_btree_insert(struct btree_op *, struct cache_set *);
383 301
384int bch_btree_search_recurse(struct btree *, struct btree_op *); 302typedef int (btree_map_nodes_fn)(struct btree_op *, struct btree *);
303int __bch_btree_map_nodes(struct btree_op *, struct cache_set *,
304 struct bkey *, btree_map_nodes_fn *, int);
385 305
386void bch_queue_gc(struct cache_set *); 306static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
387size_t bch_btree_gc_finish(struct cache_set *); 307 struct bkey *from, btree_map_nodes_fn *fn)
388void bch_moving_gc(struct closure *); 308{
389int bch_btree_check(struct cache_set *, struct btree_op *); 309 return __bch_btree_map_nodes(op, c, from, fn, MAP_ALL_NODES);
390uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); 310}
311
312static inline int bch_btree_map_leaf_nodes(struct btree_op *op,
313 struct cache_set *c,
314 struct bkey *from,
315 btree_map_nodes_fn *fn)
316{
317 return __bch_btree_map_nodes(op, c, from, fn, MAP_LEAF_NODES);
318}
319
320typedef int (btree_map_keys_fn)(struct btree_op *, struct btree *,
321 struct bkey *);
322int bch_btree_map_keys(struct btree_op *, struct cache_set *,
323 struct bkey *, btree_map_keys_fn *, int);
324
325typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);
391 326
392void bch_keybuf_init(struct keybuf *); 327void bch_keybuf_init(struct keybuf *);
393void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *, 328void bch_refill_keybuf(struct cache_set *, struct keybuf *,
394 keybuf_pred_fn *); 329 struct bkey *, keybuf_pred_fn *);
395bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, 330bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *,
396 struct bkey *); 331 struct bkey *);
397void bch_keybuf_del(struct keybuf *, struct keybuf_key *); 332void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
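
With the slimmed-down struct btree_op in this header, bch_btree_op_init(op, write_lock_level) is all the setup a caller does, and op->lock keeps its old meaning: take a write lock at that btree level and below, read locks above it (SHRT_MAX for everything, 0 for leaves only, -1 for none, matching the values used in the btree.c changes). A trivial sketch of how that threshold is interpreted while descending; the levels and root depth here are made up.

#include <stdio.h>

/* mirrors the insert_lock()/rw_lock() convention: write-lock when level <= op->lock */
static const char *lock_kind(int level, int write_lock_level)
{
    return level <= write_lock_level ? "write" : "read";
}

int main(void)
{
    int root_level = 3;         /* made-up tree depth */
    int write_lock_level = 0;   /* what bch_btree_op_init() would record */
    int level;

    for (level = root_level; level >= 0; level--)
        printf("level %d: take a %s lock\n",
               level, lock_kind(level, write_lock_level));
    return 0;
}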
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 9aba2017f0d1..dfff2410322e 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -11,17 +11,6 @@
11 11
12#include "closure.h" 12#include "closure.h"
13 13
14void closure_queue(struct closure *cl)
15{
16 struct workqueue_struct *wq = cl->wq;
17 if (wq) {
18 INIT_WORK(&cl->work, cl->work.func);
19 BUG_ON(!queue_work(wq, &cl->work));
20 } else
21 cl->fn(cl);
22}
23EXPORT_SYMBOL_GPL(closure_queue);
24
25#define CL_FIELD(type, field) \ 14#define CL_FIELD(type, field) \
26 case TYPE_ ## type: \ 15 case TYPE_ ## type: \
27 return &container_of(cl, struct type, cl)->field 16 return &container_of(cl, struct type, cl)->field
@@ -30,17 +19,6 @@ static struct closure_waitlist *closure_waitlist(struct closure *cl)
30{ 19{
31 switch (cl->type) { 20 switch (cl->type) {
32 CL_FIELD(closure_with_waitlist, wait); 21 CL_FIELD(closure_with_waitlist, wait);
33 CL_FIELD(closure_with_waitlist_and_timer, wait);
34 default:
35 return NULL;
36 }
37}
38
39static struct timer_list *closure_timer(struct closure *cl)
40{
41 switch (cl->type) {
42 CL_FIELD(closure_with_timer, timer);
43 CL_FIELD(closure_with_waitlist_and_timer, timer);
44 default: 22 default:
45 return NULL; 23 return NULL;
46 } 24 }
@@ -51,7 +29,7 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
51 int r = flags & CLOSURE_REMAINING_MASK; 29 int r = flags & CLOSURE_REMAINING_MASK;
52 30
53 BUG_ON(flags & CLOSURE_GUARD_MASK); 31 BUG_ON(flags & CLOSURE_GUARD_MASK);
54 BUG_ON(!r && (flags & ~(CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING))); 32 BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
55 33
56 /* Must deliver precisely one wakeup */ 34 /* Must deliver precisely one wakeup */
57 if (r == 1 && (flags & CLOSURE_SLEEPING)) 35 if (r == 1 && (flags & CLOSURE_SLEEPING))
@@ -59,7 +37,6 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
59 37
60 if (!r) { 38 if (!r) {
61 if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { 39 if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
62 /* CLOSURE_BLOCKING might be set - clear it */
63 atomic_set(&cl->remaining, 40 atomic_set(&cl->remaining,
64 CLOSURE_REMAINING_INITIALIZER); 41 CLOSURE_REMAINING_INITIALIZER);
65 closure_queue(cl); 42 closure_queue(cl);
@@ -90,13 +67,13 @@ void closure_sub(struct closure *cl, int v)
90{ 67{
91 closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); 68 closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
92} 69}
93EXPORT_SYMBOL_GPL(closure_sub); 70EXPORT_SYMBOL(closure_sub);
94 71
95void closure_put(struct closure *cl) 72void closure_put(struct closure *cl)
96{ 73{
97 closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); 74 closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
98} 75}
99EXPORT_SYMBOL_GPL(closure_put); 76EXPORT_SYMBOL(closure_put);
100 77
101static void set_waiting(struct closure *cl, unsigned long f) 78static void set_waiting(struct closure *cl, unsigned long f)
102{ 79{
@@ -133,7 +110,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list)
133 closure_sub(cl, CLOSURE_WAITING + 1); 110 closure_sub(cl, CLOSURE_WAITING + 1);
134 } 111 }
135} 112}
136EXPORT_SYMBOL_GPL(__closure_wake_up); 113EXPORT_SYMBOL(__closure_wake_up);
137 114
138bool closure_wait(struct closure_waitlist *list, struct closure *cl) 115bool closure_wait(struct closure_waitlist *list, struct closure *cl)
139{ 116{
@@ -146,7 +123,7 @@ bool closure_wait(struct closure_waitlist *list, struct closure *cl)
146 123
147 return true; 124 return true;
148} 125}
149EXPORT_SYMBOL_GPL(closure_wait); 126EXPORT_SYMBOL(closure_wait);
150 127
151/** 128/**
152 * closure_sync() - sleep until a closure has nothing left to wait on 129 * closure_sync() - sleep until a closure has nothing left to wait on
@@ -169,7 +146,7 @@ void closure_sync(struct closure *cl)
169 146
170 __closure_end_sleep(cl); 147 __closure_end_sleep(cl);
171} 148}
172EXPORT_SYMBOL_GPL(closure_sync); 149EXPORT_SYMBOL(closure_sync);
173 150
174/** 151/**
175 * closure_trylock() - try to acquire the closure, without waiting 152 * closure_trylock() - try to acquire the closure, without waiting
@@ -183,17 +160,17 @@ bool closure_trylock(struct closure *cl, struct closure *parent)
183 CLOSURE_REMAINING_INITIALIZER) != -1) 160 CLOSURE_REMAINING_INITIALIZER) != -1)
184 return false; 161 return false;
185 162
186 closure_set_ret_ip(cl);
187
188 smp_mb(); 163 smp_mb();
164
189 cl->parent = parent; 165 cl->parent = parent;
190 if (parent) 166 if (parent)
191 closure_get(parent); 167 closure_get(parent);
192 168
169 closure_set_ret_ip(cl);
193 closure_debug_create(cl); 170 closure_debug_create(cl);
194 return true; 171 return true;
195} 172}
196EXPORT_SYMBOL_GPL(closure_trylock); 173EXPORT_SYMBOL(closure_trylock);
197 174
198void __closure_lock(struct closure *cl, struct closure *parent, 175void __closure_lock(struct closure *cl, struct closure *parent,
199 struct closure_waitlist *wait_list) 176 struct closure_waitlist *wait_list)
@@ -205,57 +182,11 @@ void __closure_lock(struct closure *cl, struct closure *parent,
205 if (closure_trylock(cl, parent)) 182 if (closure_trylock(cl, parent))
206 return; 183 return;
207 184
208 closure_wait_event_sync(wait_list, &wait, 185 closure_wait_event(wait_list, &wait,
209 atomic_read(&cl->remaining) == -1); 186 atomic_read(&cl->remaining) == -1);
210 } 187 }
211} 188}
212EXPORT_SYMBOL_GPL(__closure_lock); 189EXPORT_SYMBOL(__closure_lock);
213
214static void closure_delay_timer_fn(unsigned long data)
215{
216 struct closure *cl = (struct closure *) data;
217 closure_sub(cl, CLOSURE_TIMER + 1);
218}
219
220void do_closure_timer_init(struct closure *cl)
221{
222 struct timer_list *timer = closure_timer(cl);
223
224 init_timer(timer);
225 timer->data = (unsigned long) cl;
226 timer->function = closure_delay_timer_fn;
227}
228EXPORT_SYMBOL_GPL(do_closure_timer_init);
229
230bool __closure_delay(struct closure *cl, unsigned long delay,
231 struct timer_list *timer)
232{
233 if (atomic_read(&cl->remaining) & CLOSURE_TIMER)
234 return false;
235
236 BUG_ON(timer_pending(timer));
237
238 timer->expires = jiffies + delay;
239
240 atomic_add(CLOSURE_TIMER + 1, &cl->remaining);
241 add_timer(timer);
242 return true;
243}
244EXPORT_SYMBOL_GPL(__closure_delay);
245
246void __closure_flush(struct closure *cl, struct timer_list *timer)
247{
248 if (del_timer(timer))
249 closure_sub(cl, CLOSURE_TIMER + 1);
250}
251EXPORT_SYMBOL_GPL(__closure_flush);
252
253void __closure_flush_sync(struct closure *cl, struct timer_list *timer)
254{
255 if (del_timer_sync(timer))
256 closure_sub(cl, CLOSURE_TIMER + 1);
257}
258EXPORT_SYMBOL_GPL(__closure_flush_sync);
259 190
260#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 191#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
261 192
@@ -273,7 +204,7 @@ void closure_debug_create(struct closure *cl)
273 list_add(&cl->all, &closure_list); 204 list_add(&cl->all, &closure_list);
274 spin_unlock_irqrestore(&closure_list_lock, flags); 205 spin_unlock_irqrestore(&closure_list_lock, flags);
275} 206}
276EXPORT_SYMBOL_GPL(closure_debug_create); 207EXPORT_SYMBOL(closure_debug_create);
277 208
278void closure_debug_destroy(struct closure *cl) 209void closure_debug_destroy(struct closure *cl)
279{ 210{
@@ -286,7 +217,7 @@ void closure_debug_destroy(struct closure *cl)
286 list_del(&cl->all); 217 list_del(&cl->all);
287 spin_unlock_irqrestore(&closure_list_lock, flags); 218 spin_unlock_irqrestore(&closure_list_lock, flags);
288} 219}
289EXPORT_SYMBOL_GPL(closure_debug_destroy); 220EXPORT_SYMBOL(closure_debug_destroy);
290 221
291static struct dentry *debug; 222static struct dentry *debug;
292 223
@@ -304,14 +235,12 @@ static int debug_seq_show(struct seq_file *f, void *data)
304 cl, (void *) cl->ip, cl->fn, cl->parent, 235 cl, (void *) cl->ip, cl->fn, cl->parent,
305 r & CLOSURE_REMAINING_MASK); 236 r & CLOSURE_REMAINING_MASK);
306 237
307 seq_printf(f, "%s%s%s%s%s%s\n", 238 seq_printf(f, "%s%s%s%s\n",
308 test_bit(WORK_STRUCT_PENDING, 239 test_bit(WORK_STRUCT_PENDING,
309 work_data_bits(&cl->work)) ? "Q" : "", 240 work_data_bits(&cl->work)) ? "Q" : "",
310 r & CLOSURE_RUNNING ? "R" : "", 241 r & CLOSURE_RUNNING ? "R" : "",
311 r & CLOSURE_BLOCKING ? "B" : "",
312 r & CLOSURE_STACK ? "S" : "", 242 r & CLOSURE_STACK ? "S" : "",
313 r & CLOSURE_SLEEPING ? "Sl" : "", 243 r & CLOSURE_SLEEPING ? "Sl" : "");
314 r & CLOSURE_TIMER ? "T" : "");
315 244
316 if (r & CLOSURE_WAITING) 245 if (r & CLOSURE_WAITING)
317 seq_printf(f, " W %pF\n", 246 seq_printf(f, " W %pF\n",
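
The closure.c hunks above remove the timer plumbing and the sync/async split: __closure_lock() now uses the plain closure_wait_event(), which (per the closure.h hunk further down) always sleeps until the condition is true. As a rough userspace model of that wait loop — pthreads standing in for the kernel's wait lists and schedule(), and every name below invented for illustration — the shape is: test the condition under a lock, sleep, and re-test on every wakeup:

/*
 * Illustrative userspace model only; the real code uses closure wait
 * lists and schedule(), not pthreads, and "remaining == -1" is the
 * unlocked state tested by __closure_lock().
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
static int remaining = 3;                /* stand-in for cl->remaining */

static bool unlocked(void)
{
	return remaining == -1;
}

static void wait_event_sync(bool (*cond)(void))
{
	pthread_mutex_lock(&lock);
	while (!cond())                      /* re-check after every wakeup */
		pthread_cond_wait(&wake, &lock);
	pthread_mutex_unlock(&lock);
}

static void *releaser(void *arg)
{
	pthread_mutex_lock(&lock);
	remaining = -1;                      /* closure becomes unlocked */
	pthread_cond_broadcast(&wake);       /* closure_wake_up() analogue */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, releaser, NULL);
	wait_event_sync(unlocked);           /* closure_wait_event() analogue */
	pthread_join(t, NULL);
	printf("lock acquired\n");
	return 0;
}
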
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 00039924ea9d..9762f1be3304 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -155,21 +155,6 @@
155 * delayed_work embeds a work item and a timer_list. The important thing is, use 155 * delayed_work embeds a work item and a timer_list. The important thing is, use
156 * it exactly like you would a regular closure and closure_put() will magically 156 * it exactly like you would a regular closure and closure_put() will magically
157 * handle everything for you. 157 * handle everything for you.
158 *
159 * We've got closures that embed timers, too. They're called, appropriately
160 * enough:
161 * struct closure_with_timer;
162 *
163 * This gives you access to closure_delay(). It takes a refcount for a specified
164 * number of jiffies - you could then call closure_sync() (for a slightly
165 * convoluted version of msleep()) or continue_at() - which gives you the same
166 * effect as using a delayed work item, except you can reuse the work_struct
167 * already embedded in struct closure.
168 *
169 * Lastly, there's struct closure_with_waitlist_and_timer. It does what you
170 * probably expect, if you happen to need the features of both. (You don't
171 * really want to know how all this is implemented, but if I've done my job
172 * right you shouldn't have to care).
173 */ 158 */
174 159
175struct closure; 160struct closure;
@@ -182,16 +167,11 @@ struct closure_waitlist {
182enum closure_type { 167enum closure_type {
183 TYPE_closure = 0, 168 TYPE_closure = 0,
184 TYPE_closure_with_waitlist = 1, 169 TYPE_closure_with_waitlist = 1,
185 TYPE_closure_with_timer = 2, 170 MAX_CLOSURE_TYPE = 1,
186 TYPE_closure_with_waitlist_and_timer = 3,
187 MAX_CLOSURE_TYPE = 3,
188}; 171};
189 172
190enum closure_state { 173enum closure_state {
191 /* 174 /*
192 * CLOSURE_BLOCKING: Causes closure_wait_event() to block, instead of
193 * waiting asynchronously
194 *
195 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by 175 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
196 * the thread that owns the closure, and cleared by the thread that's 176 * the thread that owns the closure, and cleared by the thread that's
197 * waking up the closure. 177 * waking up the closure.
@@ -200,10 +180,6 @@ enum closure_state {
200 * - indicates that cl->task is valid and closure_put() may wake it up. 180 * - indicates that cl->task is valid and closure_put() may wake it up.
201 * Only set or cleared by the thread that owns the closure. 181 * Only set or cleared by the thread that owns the closure.
202 * 182 *
 203 * CLOSURE_TIMER: Analogous to CLOSURE_WAITING, indicates that a closure
204 * has an outstanding timer. Must be set by the thread that owns the
205 * closure, and cleared by the timer function when the timer goes off.
206 *
207 * The rest are for debugging and don't affect behaviour: 183 * The rest are for debugging and don't affect behaviour:
208 * 184 *
209 * CLOSURE_RUNNING: Set when a closure is running (i.e. by 185 * CLOSURE_RUNNING: Set when a closure is running (i.e. by
@@ -218,19 +194,17 @@ enum closure_state {
218 * closure with this flag set 194 * closure with this flag set
219 */ 195 */
220 196
221 CLOSURE_BITS_START = (1 << 19), 197 CLOSURE_BITS_START = (1 << 23),
222 CLOSURE_DESTRUCTOR = (1 << 19), 198 CLOSURE_DESTRUCTOR = (1 << 23),
223 CLOSURE_BLOCKING = (1 << 21), 199 CLOSURE_WAITING = (1 << 25),
224 CLOSURE_WAITING = (1 << 23), 200 CLOSURE_SLEEPING = (1 << 27),
225 CLOSURE_SLEEPING = (1 << 25),
226 CLOSURE_TIMER = (1 << 27),
227 CLOSURE_RUNNING = (1 << 29), 201 CLOSURE_RUNNING = (1 << 29),
228 CLOSURE_STACK = (1 << 31), 202 CLOSURE_STACK = (1 << 31),
229}; 203};
230 204
231#define CLOSURE_GUARD_MASK \ 205#define CLOSURE_GUARD_MASK \
232 ((CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING|CLOSURE_WAITING| \ 206 ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_SLEEPING| \
233 CLOSURE_SLEEPING|CLOSURE_TIMER|CLOSURE_RUNNING|CLOSURE_STACK) << 1) 207 CLOSURE_RUNNING|CLOSURE_STACK) << 1)
234 208
235#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) 209#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1)
236#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) 210#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
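
For reference, the enum above packs a reference count and a set of flag bits into the single atomic cl->remaining: the low 23 bits (CLOSURE_REMAINING_MASK) hold the refcount, the flags start at bit 23, and CLOSURE_GUARD_MASK sits one bit above each flag to catch over- and underflow. A minimal arithmetic sketch with the bit positions taken from the hunk (written with unsigned literals here, CLOSURE_STACK omitted):

#include <stdio.h>

#define CLOSURE_BITS_START      (1u << 23)
#define CLOSURE_WAITING         (1u << 25)
#define CLOSURE_RUNNING         (1u << 29)
#define CLOSURE_REMAINING_MASK  (CLOSURE_BITS_START - 1)

int main(void)
{
	/* CLOSURE_REMAINING_INITIALIZER: one reference with RUNNING set */
	unsigned v = 1 | CLOSURE_RUNNING;

	v += 2;                               /* two closure_get()s */
	v |= CLOSURE_WAITING;                 /* closure parked on a wait list */

	printf("refcount = %u\n", v & CLOSURE_REMAINING_MASK);  /* 3 */
	printf("running  = %d\n", !!(v & CLOSURE_RUNNING));     /* 1 */
	printf("waiting  = %d\n", !!(v & CLOSURE_WAITING));     /* 1 */
	return 0;
}
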
@@ -268,17 +242,6 @@ struct closure_with_waitlist {
268 struct closure_waitlist wait; 242 struct closure_waitlist wait;
269}; 243};
270 244
271struct closure_with_timer {
272 struct closure cl;
273 struct timer_list timer;
274};
275
276struct closure_with_waitlist_and_timer {
277 struct closure cl;
278 struct closure_waitlist wait;
279 struct timer_list timer;
280};
281
282extern unsigned invalid_closure_type(void); 245extern unsigned invalid_closure_type(void);
283 246
284#define __CLOSURE_TYPE(cl, _t) \ 247#define __CLOSURE_TYPE(cl, _t) \
@@ -289,14 +252,11 @@ extern unsigned invalid_closure_type(void);
289( \ 252( \
290 __CLOSURE_TYPE(cl, closure) \ 253 __CLOSURE_TYPE(cl, closure) \
291 __CLOSURE_TYPE(cl, closure_with_waitlist) \ 254 __CLOSURE_TYPE(cl, closure_with_waitlist) \
292 __CLOSURE_TYPE(cl, closure_with_timer) \
293 __CLOSURE_TYPE(cl, closure_with_waitlist_and_timer) \
294 invalid_closure_type() \ 255 invalid_closure_type() \
295) 256)
296 257
297void closure_sub(struct closure *cl, int v); 258void closure_sub(struct closure *cl, int v);
298void closure_put(struct closure *cl); 259void closure_put(struct closure *cl);
299void closure_queue(struct closure *cl);
300void __closure_wake_up(struct closure_waitlist *list); 260void __closure_wake_up(struct closure_waitlist *list);
301bool closure_wait(struct closure_waitlist *list, struct closure *cl); 261bool closure_wait(struct closure_waitlist *list, struct closure *cl);
302void closure_sync(struct closure *cl); 262void closure_sync(struct closure *cl);
@@ -305,12 +265,6 @@ bool closure_trylock(struct closure *cl, struct closure *parent);
305void __closure_lock(struct closure *cl, struct closure *parent, 265void __closure_lock(struct closure *cl, struct closure *parent,
306 struct closure_waitlist *wait_list); 266 struct closure_waitlist *wait_list);
307 267
308void do_closure_timer_init(struct closure *cl);
309bool __closure_delay(struct closure *cl, unsigned long delay,
310 struct timer_list *timer);
311void __closure_flush(struct closure *cl, struct timer_list *timer);
312void __closure_flush_sync(struct closure *cl, struct timer_list *timer);
313
314#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 268#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
315 269
316void closure_debug_init(void); 270void closure_debug_init(void);
@@ -354,11 +308,6 @@ static inline void closure_set_stopped(struct closure *cl)
354 atomic_sub(CLOSURE_RUNNING, &cl->remaining); 308 atomic_sub(CLOSURE_RUNNING, &cl->remaining);
355} 309}
356 310
357static inline bool closure_is_stopped(struct closure *cl)
358{
359 return !(atomic_read(&cl->remaining) & CLOSURE_RUNNING);
360}
361
362static inline bool closure_is_unlocked(struct closure *cl) 311static inline bool closure_is_unlocked(struct closure *cl)
363{ 312{
364 return atomic_read(&cl->remaining) == -1; 313 return atomic_read(&cl->remaining) == -1;
@@ -367,14 +316,6 @@ static inline bool closure_is_unlocked(struct closure *cl)
367static inline void do_closure_init(struct closure *cl, struct closure *parent, 316static inline void do_closure_init(struct closure *cl, struct closure *parent,
368 bool running) 317 bool running)
369{ 318{
370 switch (cl->type) {
371 case TYPE_closure_with_timer:
372 case TYPE_closure_with_waitlist_and_timer:
373 do_closure_timer_init(cl);
374 default:
375 break;
376 }
377
378 cl->parent = parent; 319 cl->parent = parent;
379 if (parent) 320 if (parent)
380 closure_get(parent); 321 closure_get(parent);
@@ -429,8 +370,7 @@ do { \
429static inline void closure_init_stack(struct closure *cl) 370static inline void closure_init_stack(struct closure *cl)
430{ 371{
431 memset(cl, 0, sizeof(struct closure)); 372 memset(cl, 0, sizeof(struct closure));
432 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER| 373 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK);
433 CLOSURE_BLOCKING|CLOSURE_STACK);
434} 374}
435 375
436/** 376/**
@@ -461,24 +401,6 @@ do { \
461#define closure_lock(cl, parent) \ 401#define closure_lock(cl, parent) \
462 __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait) 402 __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait)
463 403
464/**
465 * closure_delay() - delay some number of jiffies
466 * @cl: the closure that will sleep
467 * @delay: the delay in jiffies
468 *
469 * Takes a refcount on @cl which will be released after @delay jiffies; this may
470 * be used to have a function run after a delay with continue_at(), or
471 * closure_sync() may be used for a convoluted version of msleep().
472 */
473#define closure_delay(cl, delay) \
474 __closure_delay(__to_internal_closure(cl), delay, &(cl)->timer)
475
476#define closure_flush(cl) \
477 __closure_flush(__to_internal_closure(cl), &(cl)->timer)
478
479#define closure_flush_sync(cl) \
480 __closure_flush_sync(__to_internal_closure(cl), &(cl)->timer)
481
482static inline void __closure_end_sleep(struct closure *cl) 404static inline void __closure_end_sleep(struct closure *cl)
483{ 405{
484 __set_current_state(TASK_RUNNING); 406 __set_current_state(TASK_RUNNING);
@@ -498,40 +420,6 @@ static inline void __closure_start_sleep(struct closure *cl)
498} 420}
499 421
500/** 422/**
501 * closure_blocking() - returns true if the closure is in blocking mode.
502 *
503 * If a closure is in blocking mode, closure_wait_event() will sleep until the
504 * condition is true instead of waiting asynchronously.
505 */
506static inline bool closure_blocking(struct closure *cl)
507{
508 return atomic_read(&cl->remaining) & CLOSURE_BLOCKING;
509}
510
511/**
512 * set_closure_blocking() - put a closure in blocking mode.
513 *
514 * If a closure is in blocking mode, closure_wait_event() will sleep until the
515 * condition is true instead of waiting asynchronously.
516 *
517 * Not thread safe - can only be called by the thread running the closure.
518 */
519static inline void set_closure_blocking(struct closure *cl)
520{
521 if (!closure_blocking(cl))
522 atomic_add(CLOSURE_BLOCKING, &cl->remaining);
523}
524
525/*
526 * Not thread safe - can only be called by the thread running the closure.
527 */
528static inline void clear_closure_blocking(struct closure *cl)
529{
530 if (closure_blocking(cl))
531 atomic_sub(CLOSURE_BLOCKING, &cl->remaining);
532}
533
534/**
535 * closure_wake_up() - wake up all closures on a wait list. 423 * closure_wake_up() - wake up all closures on a wait list.
536 */ 424 */
537static inline void closure_wake_up(struct closure_waitlist *list) 425static inline void closure_wake_up(struct closure_waitlist *list)
@@ -561,63 +449,36 @@ static inline void closure_wake_up(struct closure_waitlist *list)
561 * refcount on our closure. If this was a stack allocated closure, that would be 449 * refcount on our closure. If this was a stack allocated closure, that would be
562 * bad. 450 * bad.
563 */ 451 */
564#define __closure_wait_event(list, cl, condition, _block) \ 452#define closure_wait_event(list, cl, condition) \
565({ \ 453({ \
566 bool block = _block; \
567 typeof(condition) ret; \ 454 typeof(condition) ret; \
568 \ 455 \
569 while (1) { \ 456 while (1) { \
570 ret = (condition); \ 457 ret = (condition); \
571 if (ret) { \ 458 if (ret) { \
572 __closure_wake_up(list); \ 459 __closure_wake_up(list); \
573 if (block) \ 460 closure_sync(cl); \
574 closure_sync(cl); \
575 \
576 break; \ 461 break; \
577 } \ 462 } \
578 \ 463 \
579 if (block) \ 464 __closure_start_sleep(cl); \
580 __closure_start_sleep(cl); \
581 \
582 if (!closure_wait(list, cl)) { \
583 if (!block) \
584 break; \
585 \ 465 \
466 if (!closure_wait(list, cl)) \
586 schedule(); \ 467 schedule(); \
587 } \
588 } \ 468 } \
589 \ 469 \
590 ret; \ 470 ret; \
591}) 471})
592 472
593/** 473static inline void closure_queue(struct closure *cl)
594 * closure_wait_event() - wait on a condition, synchronously or asynchronously. 474{
595 * @list: the wait list to wait on 475 struct workqueue_struct *wq = cl->wq;
596 * @cl: the closure that is doing the waiting 476 if (wq) {
597 * @condition: a C expression for the event to wait for 477 INIT_WORK(&cl->work, cl->work.func);
598 * 478 BUG_ON(!queue_work(wq, &cl->work));
599 * If the closure is in blocking mode, sleeps until the @condition evaluates to 479 } else
600 * true - exactly like wait_event(). 480 cl->fn(cl);
601 * 481}
602 * If the closure is not in blocking mode, waits asynchronously; if the
603 * condition is currently false the @cl is put onto @list and returns. @list
604 * owns a refcount on @cl; closure_sync() or continue_at() may be used later to
605 * wait for another thread to wake up @list, which drops the refcount on @cl.
606 *
607 * Returns the value of @condition; @cl will be on @list iff @condition was
608 * false.
609 *
610 * closure_wake_up(@list) must be called after changing any variable that could
611 * cause @condition to become true.
612 */
613#define closure_wait_event(list, cl, condition) \
614 __closure_wait_event(list, cl, condition, closure_blocking(cl))
615
616#define closure_wait_event_async(list, cl, condition) \
617 __closure_wait_event(list, cl, condition, false)
618
619#define closure_wait_event_sync(list, cl, condition) \
620 __closure_wait_event(list, cl, condition, true)
621 482
622static inline void set_closure_fn(struct closure *cl, closure_fn *fn, 483static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
623 struct workqueue_struct *wq) 484 struct workqueue_struct *wq)
@@ -642,7 +503,7 @@ do { \
642#define continue_at_nobarrier(_cl, _fn, _wq) \ 503#define continue_at_nobarrier(_cl, _fn, _wq) \
643do { \ 504do { \
644 set_closure_fn(_cl, _fn, _wq); \ 505 set_closure_fn(_cl, _fn, _wq); \
645 closure_queue(cl); \ 506 closure_queue(_cl); \
646 return; \ 507 return; \
647} while (0) 508} while (0)
648 509
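
Two easy-to-miss details in the closure.h hunks: closure_queue() moves into the header so closure_wait_event() can dispatch directly, and continue_at_nobarrier() now passes its own parameter _cl to closure_queue() instead of the literal name cl, which previously only expanded correctly when the caller's variable happened to be called cl. A toy illustration of that macro-hygiene bug (invented macros, not the kernel ones):

#include <stdio.h>

static void queue(int v)
{
	printf("queued %d\n", v);
}

/* Buggy shape: the body names "x", not the parameter "_x", so it only
 * expands correctly when the caller's variable is literally named x. */
#define RUN_BROKEN(_x)  do { queue(x); } while (0)

/* Fixed shape: always refers to its own parameter. */
#define RUN_FIXED(_x)   do { queue(_x); } while (0)

int main(void)
{
	int my_value = 42;

	/* RUN_BROKEN(my_value); -- would fail: 'x' undeclared */
	RUN_FIXED(my_value);
	return 0;
}
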
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 88e6411eab4f..264fcfbd6290 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -8,7 +8,6 @@
8#include "bcache.h" 8#include "bcache.h"
9#include "btree.h" 9#include "btree.h"
10#include "debug.h" 10#include "debug.h"
11#include "request.h"
12 11
13#include <linux/console.h> 12#include <linux/console.h>
14#include <linux/debugfs.h> 13#include <linux/debugfs.h>
@@ -77,29 +76,17 @@ int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
77 return out - buf; 76 return out - buf;
78} 77}
79 78
80int bch_btree_to_text(char *buf, size_t size, const struct btree *b) 79#ifdef CONFIG_BCACHE_DEBUG
81{
82 return scnprintf(buf, size, "%zu level %i/%i",
83 PTR_BUCKET_NR(b->c, &b->key, 0),
84 b->level, b->c->root ? b->c->root->level : -1);
85}
86
87#if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG)
88
89static bool skipped_backwards(struct btree *b, struct bkey *k)
90{
91 return bkey_cmp(k, (!b->level)
92 ? &START_KEY(bkey_next(k))
93 : bkey_next(k)) > 0;
94}
95 80
96static void dump_bset(struct btree *b, struct bset *i) 81static void dump_bset(struct btree *b, struct bset *i)
97{ 82{
98 struct bkey *k; 83 struct bkey *k, *next;
99 unsigned j; 84 unsigned j;
100 char buf[80]; 85 char buf[80];
101 86
102 for (k = i->start; k < end(i); k = bkey_next(k)) { 87 for (k = i->start; k < end(i); k = next) {
88 next = bkey_next(k);
89
103 bch_bkey_to_text(buf, sizeof(buf), k); 90 bch_bkey_to_text(buf, sizeof(buf), k);
104 printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), 91 printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
105 (uint64_t *) k - i->d, i->keys, buf); 92 (uint64_t *) k - i->d, i->keys, buf);
@@ -115,15 +102,21 @@ static void dump_bset(struct btree *b, struct bset *i)
115 102
116 printk(" %s\n", bch_ptr_status(b->c, k)); 103 printk(" %s\n", bch_ptr_status(b->c, k));
117 104
118 if (bkey_next(k) < end(i) && 105 if (next < end(i) &&
119 skipped_backwards(b, k)) 106 bkey_cmp(k, !b->level ? &START_KEY(next) : next) > 0)
120 printk(KERN_ERR "Key skipped backwards\n"); 107 printk(KERN_ERR "Key skipped backwards\n");
121 } 108 }
122} 109}
123 110
124#endif 111static void bch_dump_bucket(struct btree *b)
112{
113 unsigned i;
125 114
126#ifdef CONFIG_BCACHE_DEBUG 115 console_lock();
116 for (i = 0; i <= b->nsets; i++)
117 dump_bset(b, b->sets[i].data);
118 console_unlock();
119}
127 120
128void bch_btree_verify(struct btree *b, struct bset *new) 121void bch_btree_verify(struct btree *b, struct bset *new)
129{ 122{
@@ -176,66 +169,44 @@ void bch_btree_verify(struct btree *b, struct bset *new)
176 mutex_unlock(&b->c->verify_lock); 169 mutex_unlock(&b->c->verify_lock);
177} 170}
178 171
179static void data_verify_endio(struct bio *bio, int error) 172void bch_data_verify(struct cached_dev *dc, struct bio *bio)
180{
181 struct closure *cl = bio->bi_private;
182 closure_put(cl);
183}
184
185void bch_data_verify(struct search *s)
186{ 173{
187 char name[BDEVNAME_SIZE]; 174 char name[BDEVNAME_SIZE];
188 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
189 struct closure *cl = &s->cl;
190 struct bio *check; 175 struct bio *check;
191 struct bio_vec *bv; 176 struct bio_vec *bv;
192 int i; 177 int i;
193 178
194 if (!s->unaligned_bvec) 179 check = bio_clone(bio, GFP_NOIO);
195 bio_for_each_segment(bv, s->orig_bio, i)
196 bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
197
198 check = bio_clone(s->orig_bio, GFP_NOIO);
199 if (!check) 180 if (!check)
200 return; 181 return;
201 182
202 if (bio_alloc_pages(check, GFP_NOIO)) 183 if (bio_alloc_pages(check, GFP_NOIO))
203 goto out_put; 184 goto out_put;
204 185
205 check->bi_rw = READ_SYNC; 186 submit_bio_wait(READ_SYNC, check);
206 check->bi_private = cl;
207 check->bi_end_io = data_verify_endio;
208
209 closure_bio_submit(check, cl, &dc->disk);
210 closure_sync(cl);
211 187
212 bio_for_each_segment(bv, s->orig_bio, i) { 188 bio_for_each_segment(bv, bio, i) {
213 void *p1 = kmap(bv->bv_page); 189 void *p1 = kmap_atomic(bv->bv_page);
214 void *p2 = kmap(check->bi_io_vec[i].bv_page); 190 void *p2 = page_address(check->bi_io_vec[i].bv_page);
215 191
216 if (memcmp(p1 + bv->bv_offset, 192 cache_set_err_on(memcmp(p1 + bv->bv_offset,
217 p2 + bv->bv_offset, 193 p2 + bv->bv_offset,
218 bv->bv_len)) 194 bv->bv_len),
219 printk(KERN_ERR 195 dc->disk.c,
220 "bcache (%s): verify failed at sector %llu\n", 196 "verify failed at dev %s sector %llu",
221 bdevname(dc->bdev, name), 197 bdevname(dc->bdev, name),
222 (uint64_t) s->orig_bio->bi_sector); 198 (uint64_t) bio->bi_sector);
223 199
224 kunmap(bv->bv_page); 200 kunmap_atomic(p1);
225 kunmap(check->bi_io_vec[i].bv_page);
226 } 201 }
227 202
228 __bio_for_each_segment(bv, check, i, 0) 203 bio_for_each_segment_all(bv, check, i)
229 __free_page(bv->bv_page); 204 __free_page(bv->bv_page);
230out_put: 205out_put:
231 bio_put(check); 206 bio_put(check);
232} 207}
233 208
234#endif 209int __bch_count_data(struct btree *b)
235
236#ifdef CONFIG_BCACHE_EDEBUG
237
238unsigned bch_count_data(struct btree *b)
239{ 210{
240 unsigned ret = 0; 211 unsigned ret = 0;
241 struct btree_iter iter; 212 struct btree_iter iter;
@@ -247,72 +218,60 @@ unsigned bch_count_data(struct btree *b)
247 return ret; 218 return ret;
248} 219}
249 220
250static void vdump_bucket_and_panic(struct btree *b, const char *fmt, 221void __bch_check_keys(struct btree *b, const char *fmt, ...)
251 va_list args)
252{
253 unsigned i;
254 char buf[80];
255
256 console_lock();
257
258 for (i = 0; i <= b->nsets; i++)
259 dump_bset(b, b->sets[i].data);
260
261 vprintk(fmt, args);
262
263 console_unlock();
264
265 bch_btree_to_text(buf, sizeof(buf), b);
266 panic("at %s\n", buf);
267}
268
269void bch_check_key_order_msg(struct btree *b, struct bset *i,
270 const char *fmt, ...)
271{
272 struct bkey *k;
273
274 if (!i->keys)
275 return;
276
277 for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k))
278 if (skipped_backwards(b, k)) {
279 va_list args;
280 va_start(args, fmt);
281
282 vdump_bucket_and_panic(b, fmt, args);
283 va_end(args);
284 }
285}
286
287void bch_check_keys(struct btree *b, const char *fmt, ...)
288{ 222{
289 va_list args; 223 va_list args;
290 struct bkey *k, *p = NULL; 224 struct bkey *k, *p = NULL;
291 struct btree_iter iter; 225 struct btree_iter iter;
292 226 const char *err;
293 if (b->level)
294 return;
295 227
296 for_each_key(b, k, &iter) { 228 for_each_key(b, k, &iter) {
297 if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) { 229 if (!b->level) {
298 printk(KERN_ERR "Keys out of order:\n"); 230 err = "Keys out of order";
299 goto bug; 231 if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
300 } 232 goto bug;
301 233
302 if (bch_ptr_invalid(b, k)) 234 if (bch_ptr_invalid(b, k))
303 continue; 235 continue;
304 236
305 if (p && bkey_cmp(p, &START_KEY(k)) > 0) { 237 err = "Overlapping keys";
306 printk(KERN_ERR "Overlapping keys:\n"); 238 if (p && bkey_cmp(p, &START_KEY(k)) > 0)
307 goto bug; 239 goto bug;
240 } else {
241 if (bch_ptr_bad(b, k))
242 continue;
243
244 err = "Duplicate keys";
245 if (p && !bkey_cmp(p, k))
246 goto bug;
308 } 247 }
309 p = k; 248 p = k;
310 } 249 }
250
251 err = "Key larger than btree node key";
252 if (p && bkey_cmp(p, &b->key) > 0)
253 goto bug;
254
311 return; 255 return;
312bug: 256bug:
257 bch_dump_bucket(b);
258
313 va_start(args, fmt); 259 va_start(args, fmt);
314 vdump_bucket_and_panic(b, fmt, args); 260 vprintk(fmt, args);
315 va_end(args); 261 va_end(args);
262
263 panic("bcache error: %s:\n", err);
264}
265
266void bch_btree_iter_next_check(struct btree_iter *iter)
267{
268 struct bkey *k = iter->data->k, *next = bkey_next(k);
269
270 if (next < iter->data->end &&
271 bkey_cmp(k, iter->b->level ? next : &START_KEY(next)) > 0) {
272 bch_dump_bucket(iter->b);
273 panic("Key skipped backwards\n");
274 }
316} 275}
317 276
318#endif 277#endif
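
The consolidated __bch_check_keys() above enforces, for leaf nodes, that keys are sorted by start and do not overlap; for internal nodes, that there are no duplicates; and for both, that no key exceeds the node's own key — dumping the bucket and panicking otherwise. A self-contained userspace sketch of just the two leaf invariants, using a simplified (end offset, size) key layout in the spirit of bcache's keys; the struct and helper names here are invented:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct key { uint64_t end; uint64_t size; };     /* start = end - size */

static uint64_t key_start(const struct key *k)
{
	return k->end - k->size;
}

static const char *check_keys(const struct key *k, size_t n)
{
	for (size_t i = 1; i < n; i++) {
		if (key_start(&k[i]) < key_start(&k[i - 1]))
			return "Keys out of order";
		if (key_start(&k[i]) < k[i - 1].end)
			return "Overlapping keys";
	}
	return NULL;
}

int main(void)
{
	struct key good[] = { { 8, 8 }, { 16, 8 }, { 32, 16 } };
	struct key bad[]  = { { 8, 8 }, { 20, 16 } };  /* [4,20) overlaps [0,8) */
	const char *err;

	err = check_keys(good, 3);
	printf("good: %s\n", err ? err : "ok");
	err = check_keys(bad, 2);
	printf("bad:  %s\n", err ? err : "ok");
	return 0;
}
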
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h
index 1c39b5a2489b..2ede60e31874 100644
--- a/drivers/md/bcache/debug.h
+++ b/drivers/md/bcache/debug.h
@@ -4,40 +4,44 @@
4/* Btree/bkey debug printing */ 4/* Btree/bkey debug printing */
5 5
6int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); 6int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k);
7int bch_btree_to_text(char *buf, size_t size, const struct btree *b);
8
9#ifdef CONFIG_BCACHE_EDEBUG
10
11unsigned bch_count_data(struct btree *);
12void bch_check_key_order_msg(struct btree *, struct bset *, const char *, ...);
13void bch_check_keys(struct btree *, const char *, ...);
14
15#define bch_check_key_order(b, i) \
16 bch_check_key_order_msg(b, i, "keys out of order")
17#define EBUG_ON(cond) BUG_ON(cond)
18
19#else /* EDEBUG */
20
21#define bch_count_data(b) 0
22#define bch_check_key_order(b, i) do {} while (0)
23#define bch_check_key_order_msg(b, i, ...) do {} while (0)
24#define bch_check_keys(b, ...) do {} while (0)
25#define EBUG_ON(cond) do {} while (0)
26
27#endif
28 7
29#ifdef CONFIG_BCACHE_DEBUG 8#ifdef CONFIG_BCACHE_DEBUG
30 9
31void bch_btree_verify(struct btree *, struct bset *); 10void bch_btree_verify(struct btree *, struct bset *);
32void bch_data_verify(struct search *); 11void bch_data_verify(struct cached_dev *, struct bio *);
12int __bch_count_data(struct btree *);
13void __bch_check_keys(struct btree *, const char *, ...);
14void bch_btree_iter_next_check(struct btree_iter *);
15
16#define EBUG_ON(cond) BUG_ON(cond)
17#define expensive_debug_checks(c) ((c)->expensive_debug_checks)
18#define key_merging_disabled(c) ((c)->key_merging_disabled)
19#define bypass_torture_test(d) ((d)->bypass_torture_test)
33 20
34#else /* DEBUG */ 21#else /* DEBUG */
35 22
36static inline void bch_btree_verify(struct btree *b, struct bset *i) {} 23static inline void bch_btree_verify(struct btree *b, struct bset *i) {}
37static inline void bch_data_verify(struct search *s) {}; 24static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {}
25static inline int __bch_count_data(struct btree *b) { return -1; }
26static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {}
27static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}
28
29#define EBUG_ON(cond) do { if (cond); } while (0)
30#define expensive_debug_checks(c) 0
31#define key_merging_disabled(c) 0
32#define bypass_torture_test(d) 0
38 33
39#endif 34#endif
40 35
36#define bch_count_data(b) \
37 (expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1)
38
39#define bch_check_keys(b, ...) \
40do { \
41 if (expensive_debug_checks((b)->c)) \
42 __bch_check_keys(b, __VA_ARGS__); \
43} while (0)
44
41#ifdef CONFIG_DEBUG_FS 45#ifdef CONFIG_DEBUG_FS
42void bch_debug_init_cache_set(struct cache_set *); 46void bch_debug_init_cache_set(struct cache_set *);
43#else 47#else
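
The debug.h rewrite drops the separate EDEBUG block in this header: the expensive checks are now compiled in under CONFIG_BCACHE_DEBUG but only executed when the cache set's expensive_debug_checks knob is set, so the per-call cost in a normal run is a single flag test. A minimal sketch of that runtime-gated pattern (illustrative names, not the bcache ones):

#include <stdbool.h>
#include <stdio.h>

struct cache_set { bool expensive_debug_checks; };

#ifdef MY_DEBUG
static int __count_data(const struct cache_set *c)
{
	/* the expensive btree walk would go here */
	return 123;
}
#else
static inline int __count_data(const struct cache_set *c)
{
	return -1;
}
#endif

/* Cheap test at every call site; the walk only runs when the runtime
 * knob is on *and* the debug code was compiled in. */
#define count_data(c) \
	((c)->expensive_debug_checks ? __count_data(c) : -1)

int main(void)
{
	struct cache_set c = { .expensive_debug_checks = true };

	printf("%d\n", count_data(&c));   /* 123 with -DMY_DEBUG, else -1 */
	return 0;
}
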
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 8435f81e5d85..ecdaa671bd50 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -7,7 +7,6 @@
7#include "bcache.h" 7#include "bcache.h"
8#include "btree.h" 8#include "btree.h"
9#include "debug.h" 9#include "debug.h"
10#include "request.h"
11 10
12#include <trace/events/bcache.h> 11#include <trace/events/bcache.h>
13 12
@@ -31,17 +30,20 @@ static void journal_read_endio(struct bio *bio, int error)
31} 30}
32 31
33static int journal_read_bucket(struct cache *ca, struct list_head *list, 32static int journal_read_bucket(struct cache *ca, struct list_head *list,
34 struct btree_op *op, unsigned bucket_index) 33 unsigned bucket_index)
35{ 34{
36 struct journal_device *ja = &ca->journal; 35 struct journal_device *ja = &ca->journal;
37 struct bio *bio = &ja->bio; 36 struct bio *bio = &ja->bio;
38 37
39 struct journal_replay *i; 38 struct journal_replay *i;
40 struct jset *j, *data = ca->set->journal.w[0].data; 39 struct jset *j, *data = ca->set->journal.w[0].data;
40 struct closure cl;
41 unsigned len, left, offset = 0; 41 unsigned len, left, offset = 0;
42 int ret = 0; 42 int ret = 0;
43 sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); 43 sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);
44 44
45 closure_init_stack(&cl);
46
45 pr_debug("reading %llu", (uint64_t) bucket); 47 pr_debug("reading %llu", (uint64_t) bucket);
46 48
47 while (offset < ca->sb.bucket_size) { 49 while (offset < ca->sb.bucket_size) {
@@ -55,11 +57,11 @@ reread: left = ca->sb.bucket_size - offset;
55 bio->bi_size = len << 9; 57 bio->bi_size = len << 9;
56 58
57 bio->bi_end_io = journal_read_endio; 59 bio->bi_end_io = journal_read_endio;
58 bio->bi_private = &op->cl; 60 bio->bi_private = &cl;
59 bch_bio_map(bio, data); 61 bch_bio_map(bio, data);
60 62
61 closure_bio_submit(bio, &op->cl, ca); 63 closure_bio_submit(bio, &cl, ca);
62 closure_sync(&op->cl); 64 closure_sync(&cl);
63 65
64 /* This function could be simpler now since we no longer write 66 /* This function could be simpler now since we no longer write
65 * journal entries that overlap bucket boundaries; this means 67 * journal entries that overlap bucket boundaries; this means
@@ -72,7 +74,7 @@ reread: left = ca->sb.bucket_size - offset;
72 struct list_head *where; 74 struct list_head *where;
73 size_t blocks, bytes = set_bytes(j); 75 size_t blocks, bytes = set_bytes(j);
74 76
75 if (j->magic != jset_magic(ca->set)) 77 if (j->magic != jset_magic(&ca->sb))
76 return ret; 78 return ret;
77 79
78 if (bytes > left << 9) 80 if (bytes > left << 9)
@@ -129,12 +131,11 @@ next_set:
129 return ret; 131 return ret;
130} 132}
131 133
132int bch_journal_read(struct cache_set *c, struct list_head *list, 134int bch_journal_read(struct cache_set *c, struct list_head *list)
133 struct btree_op *op)
134{ 135{
135#define read_bucket(b) \ 136#define read_bucket(b) \
136 ({ \ 137 ({ \
137 int ret = journal_read_bucket(ca, list, op, b); \ 138 int ret = journal_read_bucket(ca, list, b); \
138 __set_bit(b, bitmap); \ 139 __set_bit(b, bitmap); \
139 if (ret < 0) \ 140 if (ret < 0) \
140 return ret; \ 141 return ret; \
@@ -292,8 +293,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
292 } 293 }
293} 294}
294 295
295int bch_journal_replay(struct cache_set *s, struct list_head *list, 296int bch_journal_replay(struct cache_set *s, struct list_head *list)
296 struct btree_op *op)
297{ 297{
298 int ret = 0, keys = 0, entries = 0; 298 int ret = 0, keys = 0, entries = 0;
299 struct bkey *k; 299 struct bkey *k;
@@ -301,31 +301,30 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list,
301 list_entry(list->prev, struct journal_replay, list); 301 list_entry(list->prev, struct journal_replay, list);
302 302
303 uint64_t start = i->j.last_seq, end = i->j.seq, n = start; 303 uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
304 struct keylist keylist;
305
306 bch_keylist_init(&keylist);
304 307
305 list_for_each_entry(i, list, list) { 308 list_for_each_entry(i, list, list) {
306 BUG_ON(i->pin && atomic_read(i->pin) != 1); 309 BUG_ON(i->pin && atomic_read(i->pin) != 1);
307 310
308 if (n != i->j.seq) 311 cache_set_err_on(n != i->j.seq, s,
309 pr_err( 312"bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)",
310 "journal entries %llu-%llu missing! (replaying %llu-%llu)\n", 313 n, i->j.seq - 1, start, end);
311 n, i->j.seq - 1, start, end);
312 314
313 for (k = i->j.start; 315 for (k = i->j.start;
314 k < end(&i->j); 316 k < end(&i->j);
315 k = bkey_next(k)) { 317 k = bkey_next(k)) {
316 trace_bcache_journal_replay_key(k); 318 trace_bcache_journal_replay_key(k);
317 319
318 bkey_copy(op->keys.top, k); 320 bkey_copy(keylist.top, k);
319 bch_keylist_push(&op->keys); 321 bch_keylist_push(&keylist);
320
321 op->journal = i->pin;
322 atomic_inc(op->journal);
323 322
324 ret = bch_btree_insert(op, s); 323 ret = bch_btree_insert(s, &keylist, i->pin, NULL);
325 if (ret) 324 if (ret)
326 goto err; 325 goto err;
327 326
328 BUG_ON(!bch_keylist_empty(&op->keys)); 327 BUG_ON(!bch_keylist_empty(&keylist));
329 keys++; 328 keys++;
330 329
331 cond_resched(); 330 cond_resched();
@@ -339,14 +338,13 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list,
339 338
340 pr_info("journal replay done, %i keys in %i entries, seq %llu", 339 pr_info("journal replay done, %i keys in %i entries, seq %llu",
341 keys, entries, end); 340 keys, entries, end);
342 341err:
343 while (!list_empty(list)) { 342 while (!list_empty(list)) {
344 i = list_first_entry(list, struct journal_replay, list); 343 i = list_first_entry(list, struct journal_replay, list);
345 list_del(&i->list); 344 list_del(&i->list);
346 kfree(i); 345 kfree(i);
347 } 346 }
348err: 347
349 closure_sync(&op->cl);
350 return ret; 348 return ret;
351} 349}
352 350
@@ -358,48 +356,35 @@ static void btree_flush_write(struct cache_set *c)
 358 * Try to find the btree node that references the oldest journal 356 * Try to find the btree node that references the oldest journal
359 * entry, best is our current candidate and is locked if non NULL: 357 * entry, best is our current candidate and is locked if non NULL:
360 */ 358 */
361 struct btree *b, *best = NULL; 359 struct btree *b, *best;
362 unsigned iter; 360 unsigned i;
361retry:
362 best = NULL;
363
364 for_each_cached_btree(b, c, i)
365 if (btree_current_write(b)->journal) {
366 if (!best)
367 best = b;
368 else if (journal_pin_cmp(c,
369 btree_current_write(best)->journal,
370 btree_current_write(b)->journal)) {
371 best = b;
372 }
373 }
363 374
364 for_each_cached_btree(b, c, iter) { 375 b = best;
365 if (!down_write_trylock(&b->lock)) 376 if (b) {
366 continue; 377 rw_lock(true, b, b->level);
367 378
368 if (!btree_node_dirty(b) || 379 if (!btree_current_write(b)->journal) {
369 !btree_current_write(b)->journal) {
370 rw_unlock(true, b); 380 rw_unlock(true, b);
371 continue; 381 /* We raced */
382 goto retry;
372 } 383 }
373 384
374 if (!best) 385 bch_btree_node_write(b, NULL);
375 best = b; 386 rw_unlock(true, b);
376 else if (journal_pin_cmp(c,
377 btree_current_write(best),
378 btree_current_write(b))) {
379 rw_unlock(true, best);
380 best = b;
381 } else
382 rw_unlock(true, b);
383 } 387 }
384
385 if (best)
386 goto out;
387
388 /* We can't find the best btree node, just pick the first */
389 list_for_each_entry(b, &c->btree_cache, list)
390 if (!b->level && btree_node_dirty(b)) {
391 best = b;
392 rw_lock(true, best, best->level);
393 goto found;
394 }
395
396out:
397 if (!best)
398 return;
399found:
400 if (btree_node_dirty(best))
401 bch_btree_node_write(best, NULL);
402 rw_unlock(true, best);
403} 388}
404 389
405#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) 390#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1)
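
The rewritten btree_flush_write() above scans for the node pinning the oldest journal entry without holding its lock, then takes the lock and re-checks; if the node was flushed in the meantime, it simply retries the scan. A small userspace sketch of that optimistic pick/lock/re-check/retry shape (no btree here; all names made up):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct node { pthread_rwlock_t lock; bool dirty; };

/* Unlocked scan: the answer may already be stale when we return. */
static struct node *pick_dirty(struct node *nodes, int n)
{
	for (int i = 0; i < n; i++)
		if (nodes[i].dirty)
			return &nodes[i];
	return NULL;
}

static void flush_one(struct node *nodes, int n)
{
	struct node *best;
retry:
	best = pick_dirty(nodes, n);
	if (!best)
		return;

	pthread_rwlock_wrlock(&best->lock);
	if (!best->dirty) {                  /* raced with another flusher */
		pthread_rwlock_unlock(&best->lock);
		goto retry;
	}
	best->dirty = false;                 /* "write out the node" */
	pthread_rwlock_unlock(&best->lock);
}

int main(void)
{
	struct node nodes[2] = {
		{ PTHREAD_RWLOCK_INITIALIZER, true  },
		{ PTHREAD_RWLOCK_INITIALIZER, false },
	};

	flush_one(nodes, 2);
	printf("node0 dirty: %d\n", nodes[0].dirty);     /* 0 */
	return 0;
}
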
@@ -495,7 +480,7 @@ static void journal_reclaim(struct cache_set *c)
495 do_journal_discard(ca); 480 do_journal_discard(ca);
496 481
497 if (c->journal.blocks_free) 482 if (c->journal.blocks_free)
498 return; 483 goto out;
499 484
500 /* 485 /*
501 * Allocate: 486 * Allocate:
@@ -521,7 +506,7 @@ static void journal_reclaim(struct cache_set *c)
521 506
522 if (n) 507 if (n)
523 c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; 508 c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
524 509out:
525 if (!journal_full(&c->journal)) 510 if (!journal_full(&c->journal))
526 __closure_wake_up(&c->journal.wait); 511 __closure_wake_up(&c->journal.wait);
527} 512}
@@ -554,32 +539,26 @@ static void journal_write_endio(struct bio *bio, int error)
554 struct journal_write *w = bio->bi_private; 539 struct journal_write *w = bio->bi_private;
555 540
556 cache_set_err_on(error, w->c, "journal io error"); 541 cache_set_err_on(error, w->c, "journal io error");
557 closure_put(&w->c->journal.io.cl); 542 closure_put(&w->c->journal.io);
558} 543}
559 544
560static void journal_write(struct closure *); 545static void journal_write(struct closure *);
561 546
562static void journal_write_done(struct closure *cl) 547static void journal_write_done(struct closure *cl)
563{ 548{
564 struct journal *j = container_of(cl, struct journal, io.cl); 549 struct journal *j = container_of(cl, struct journal, io);
565 struct cache_set *c = container_of(j, struct cache_set, journal);
566
567 struct journal_write *w = (j->cur == j->w) 550 struct journal_write *w = (j->cur == j->w)
568 ? &j->w[1] 551 ? &j->w[1]
569 : &j->w[0]; 552 : &j->w[0];
570 553
571 __closure_wake_up(&w->wait); 554 __closure_wake_up(&w->wait);
572 555 continue_at_nobarrier(cl, journal_write, system_wq);
573 if (c->journal_delay_ms)
574 closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms));
575
576 continue_at(cl, journal_write, system_wq);
577} 556}
578 557
579static void journal_write_unlocked(struct closure *cl) 558static void journal_write_unlocked(struct closure *cl)
580 __releases(c->journal.lock) 559 __releases(c->journal.lock)
581{ 560{
582 struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); 561 struct cache_set *c = container_of(cl, struct cache_set, journal.io);
583 struct cache *ca; 562 struct cache *ca;
584 struct journal_write *w = c->journal.cur; 563 struct journal_write *w = c->journal.cur;
585 struct bkey *k = &c->journal.key; 564 struct bkey *k = &c->journal.key;
@@ -617,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl)
617 for_each_cache(ca, c, i) 596 for_each_cache(ca, c, i)
618 w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; 597 w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
619 598
620 w->data->magic = jset_magic(c); 599 w->data->magic = jset_magic(&c->sb);
621 w->data->version = BCACHE_JSET_VERSION; 600 w->data->version = BCACHE_JSET_VERSION;
622 w->data->last_seq = last_seq(&c->journal); 601 w->data->last_seq = last_seq(&c->journal);
623 w->data->csum = csum_set(w->data); 602 w->data->csum = csum_set(w->data);
@@ -660,121 +639,134 @@ static void journal_write_unlocked(struct closure *cl)
660 639
661static void journal_write(struct closure *cl) 640static void journal_write(struct closure *cl)
662{ 641{
663 struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); 642 struct cache_set *c = container_of(cl, struct cache_set, journal.io);
664 643
665 spin_lock(&c->journal.lock); 644 spin_lock(&c->journal.lock);
666 journal_write_unlocked(cl); 645 journal_write_unlocked(cl);
667} 646}
668 647
669static void __journal_try_write(struct cache_set *c, bool noflush) 648static void journal_try_write(struct cache_set *c)
670 __releases(c->journal.lock) 649 __releases(c->journal.lock)
671{ 650{
672 struct closure *cl = &c->journal.io.cl; 651 struct closure *cl = &c->journal.io;
652 struct journal_write *w = c->journal.cur;
673 653
674 if (!closure_trylock(cl, &c->cl)) 654 w->need_write = true;
675 spin_unlock(&c->journal.lock); 655
676 else if (noflush && journal_full(&c->journal)) { 656 if (closure_trylock(cl, &c->cl))
677 spin_unlock(&c->journal.lock);
678 continue_at(cl, journal_write, system_wq);
679 } else
680 journal_write_unlocked(cl); 657 journal_write_unlocked(cl);
658 else
659 spin_unlock(&c->journal.lock);
681} 660}
682 661
683#define journal_try_write(c) __journal_try_write(c, false) 662static struct journal_write *journal_wait_for_write(struct cache_set *c,
684 663 unsigned nkeys)
685void bch_journal_meta(struct cache_set *c, struct closure *cl)
686{ 664{
687 struct journal_write *w; 665 size_t sectors;
666 struct closure cl;
688 667
689 if (CACHE_SYNC(&c->sb)) { 668 closure_init_stack(&cl);
690 spin_lock(&c->journal.lock); 669
670 spin_lock(&c->journal.lock);
691 671
692 w = c->journal.cur; 672 while (1) {
693 w->need_write = true; 673 struct journal_write *w = c->journal.cur;
694 674
695 if (cl) 675 sectors = __set_blocks(w->data, w->data->keys + nkeys,
696 BUG_ON(!closure_wait(&w->wait, cl)); 676 c) * c->sb.block_size;
697 677
698 closure_flush(&c->journal.io); 678 if (sectors <= min_t(size_t,
699 __journal_try_write(c, true); 679 c->journal.blocks_free * c->sb.block_size,
680 PAGE_SECTORS << JSET_BITS))
681 return w;
682
683 /* XXX: tracepoint */
684 if (!journal_full(&c->journal)) {
685 trace_bcache_journal_entry_full(c);
686
687 /*
688 * XXX: If we were inserting so many keys that they
689 * won't fit in an _empty_ journal write, we'll
690 * deadlock. For now, handle this in
691 * bch_keylist_realloc() - but something to think about.
692 */
693 BUG_ON(!w->data->keys);
694
695 closure_wait(&w->wait, &cl);
696 journal_try_write(c); /* unlocks */
697 } else {
698 trace_bcache_journal_full(c);
699
700 closure_wait(&c->journal.wait, &cl);
701 journal_reclaim(c);
702 spin_unlock(&c->journal.lock);
703
704 btree_flush_write(c);
705 }
706
707 closure_sync(&cl);
708 spin_lock(&c->journal.lock);
700 } 709 }
701} 710}
702 711
712static void journal_write_work(struct work_struct *work)
713{
714 struct cache_set *c = container_of(to_delayed_work(work),
715 struct cache_set,
716 journal.work);
717 spin_lock(&c->journal.lock);
718 journal_try_write(c);
719}
720
703/* 721/*
704 * Entry point to the journalling code - bio_insert() and btree_invalidate() 722 * Entry point to the journalling code - bio_insert() and btree_invalidate()
705 * pass bch_journal() a list of keys to be journalled, and then 723 * pass bch_journal() a list of keys to be journalled, and then
706 * bch_journal() hands those same keys off to btree_insert_async() 724 * bch_journal() hands those same keys off to btree_insert_async()
707 */ 725 */
708 726
709void bch_journal(struct closure *cl) 727atomic_t *bch_journal(struct cache_set *c,
728 struct keylist *keys,
729 struct closure *parent)
710{ 730{
711 struct btree_op *op = container_of(cl, struct btree_op, cl);
712 struct cache_set *c = op->c;
713 struct journal_write *w; 731 struct journal_write *w;
714 size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list; 732 atomic_t *ret;
715
716 if (op->type != BTREE_INSERT ||
717 !CACHE_SYNC(&c->sb))
718 goto out;
719 733
720 /* 734 if (!CACHE_SYNC(&c->sb))
721 * If we're looping because we errored, might already be waiting on 735 return NULL;
722 * another journal write:
723 */
724 while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING)
725 closure_sync(cl->parent);
726 736
727 spin_lock(&c->journal.lock); 737 w = journal_wait_for_write(c, bch_keylist_nkeys(keys));
728 738
729 if (journal_full(&c->journal)) { 739 memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys));
730 trace_bcache_journal_full(c); 740 w->data->keys += bch_keylist_nkeys(keys);
731 741
732 closure_wait(&c->journal.wait, cl); 742 ret = &fifo_back(&c->journal.pin);
743 atomic_inc(ret);
733 744
734 journal_reclaim(c); 745 if (parent) {
746 closure_wait(&w->wait, parent);
747 journal_try_write(c);
748 } else if (!w->need_write) {
749 schedule_delayed_work(&c->journal.work,
750 msecs_to_jiffies(c->journal_delay_ms));
751 spin_unlock(&c->journal.lock);
752 } else {
735 spin_unlock(&c->journal.lock); 753 spin_unlock(&c->journal.lock);
736
737 btree_flush_write(c);
738 continue_at(cl, bch_journal, bcache_wq);
739 } 754 }
740 755
741 w = c->journal.cur;
742 w->need_write = true;
743 b = __set_blocks(w->data, w->data->keys + n, c);
744
745 if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
746 b > c->journal.blocks_free) {
747 trace_bcache_journal_entry_full(c);
748
749 /*
750 * XXX: If we were inserting so many keys that they won't fit in
751 * an _empty_ journal write, we'll deadlock. For now, handle
752 * this in bch_keylist_realloc() - but something to think about.
753 */
754 BUG_ON(!w->data->keys);
755
756 BUG_ON(!closure_wait(&w->wait, cl));
757
758 closure_flush(&c->journal.io);
759 756
760 journal_try_write(c); 757 return ret;
761 continue_at(cl, bch_journal, bcache_wq); 758}
762 }
763
764 memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t));
765 w->data->keys += n;
766 759
767 op->journal = &fifo_back(&c->journal.pin); 760void bch_journal_meta(struct cache_set *c, struct closure *cl)
768 atomic_inc(op->journal); 761{
762 struct keylist keys;
763 atomic_t *ref;
769 764
770 if (op->flush_journal) { 765 bch_keylist_init(&keys);
771 closure_flush(&c->journal.io);
772 closure_wait(&w->wait, cl->parent);
773 }
774 766
775 journal_try_write(c); 767 ref = bch_journal(c, &keys, cl);
776out: 768 if (ref)
777 bch_btree_insert_async(cl); 769 atomic_dec_bug(ref);
778} 770}
779 771
780void bch_journal_free(struct cache_set *c) 772void bch_journal_free(struct cache_set *c)
@@ -790,6 +782,7 @@ int bch_journal_alloc(struct cache_set *c)
790 782
791 closure_init_unlocked(&j->io); 783 closure_init_unlocked(&j->io);
792 spin_lock_init(&j->lock); 784 spin_lock_init(&j->lock);
785 INIT_DELAYED_WORK(&j->work, journal_write_work);
793 786
794 c->journal_delay_ms = 100; 787 c->journal_delay_ms = 100;
795 788
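
The net effect of the journal.c rework above is an API change: bch_journal() now takes the keylist and an optional parent closure directly and returns a pointer into the journal's pin FIFO; the caller holds that reference until the matching btree insert completes, and the write is flushed either immediately (when a parent closure is waiting) or later via the new delayed work item. A much-simplified userspace model of the pin idea — the types and function names below are invented:

#include <stdatomic.h>
#include <stdio.h>

struct journal_entry { atomic_int pin; };

/* Append keys to the open entry and hand back a pinned reference. */
static atomic_int *journal_add(struct journal_entry *e /* , keys... */)
{
	atomic_fetch_add(&e->pin, 1);        /* keys now pin this entry */
	return &e->pin;
}

/* Called once the keys have been inserted into the btree. */
static void index_update_done(atomic_int *ref)
{
	atomic_fetch_sub(ref, 1);            /* entry may now be reclaimed */
}

int main(void)
{
	struct journal_entry e;
	atomic_int *ref;

	atomic_init(&e.pin, 1);              /* journal's own reference */

	ref = journal_add(&e);
	printf("pinned:   %d\n", atomic_load(&e.pin));   /* 2 */

	index_update_done(ref);
	printf("released: %d\n", atomic_load(&e.pin));   /* 1 */
	return 0;
}
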
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index 3d7851274b04..a6472fda94b2 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -75,43 +75,6 @@
75 * nodes that are pinning the oldest journal entries first. 75 * nodes that are pinning the oldest journal entries first.
76 */ 76 */
77 77
78#define BCACHE_JSET_VERSION_UUIDv1 1
79/* Always latest UUID format */
80#define BCACHE_JSET_VERSION_UUID 1
81#define BCACHE_JSET_VERSION 1
82
83/*
84 * On disk format for a journal entry:
85 * seq is monotonically increasing; every journal entry has its own unique
86 * sequence number.
87 *
88 * last_seq is the oldest journal entry that still has keys the btree hasn't
89 * flushed to disk yet.
90 *
91 * version is for on disk format changes.
92 */
93struct jset {
94 uint64_t csum;
95 uint64_t magic;
96 uint64_t seq;
97 uint32_t version;
98 uint32_t keys;
99
100 uint64_t last_seq;
101
102 BKEY_PADDED(uuid_bucket);
103 BKEY_PADDED(btree_root);
104 uint16_t btree_level;
105 uint16_t pad[3];
106
107 uint64_t prio_bucket[MAX_CACHES_PER_SET];
108
109 union {
110 struct bkey start[0];
111 uint64_t d[0];
112 };
113};
114
115/* 78/*
116 * Only used for holding the journal entries we read in btree_journal_read() 79 * Only used for holding the journal entries we read in btree_journal_read()
117 * during cache_registration 80 * during cache_registration
@@ -140,7 +103,8 @@ struct journal {
140 spinlock_t lock; 103 spinlock_t lock;
141 /* used when waiting because the journal was full */ 104 /* used when waiting because the journal was full */
142 struct closure_waitlist wait; 105 struct closure_waitlist wait;
143 struct closure_with_timer io; 106 struct closure io;
107 struct delayed_work work;
144 108
145 /* Number of blocks free in the bucket(s) we're currently writing to */ 109 /* Number of blocks free in the bucket(s) we're currently writing to */
146 unsigned blocks_free; 110 unsigned blocks_free;
@@ -188,8 +152,7 @@ struct journal_device {
188}; 152};
189 153
190#define journal_pin_cmp(c, l, r) \ 154#define journal_pin_cmp(c, l, r) \
191 (fifo_idx(&(c)->journal.pin, (l)->journal) > \ 155 (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r)))
192 fifo_idx(&(c)->journal.pin, (r)->journal))
193 156
194#define JOURNAL_PIN 20000 157#define JOURNAL_PIN 20000
195 158
@@ -199,15 +162,14 @@ struct journal_device {
199struct closure; 162struct closure;
200struct cache_set; 163struct cache_set;
201struct btree_op; 164struct btree_op;
165struct keylist;
202 166
203void bch_journal(struct closure *); 167atomic_t *bch_journal(struct cache_set *, struct keylist *, struct closure *);
204void bch_journal_next(struct journal *); 168void bch_journal_next(struct journal *);
205void bch_journal_mark(struct cache_set *, struct list_head *); 169void bch_journal_mark(struct cache_set *, struct list_head *);
206void bch_journal_meta(struct cache_set *, struct closure *); 170void bch_journal_meta(struct cache_set *, struct closure *);
207int bch_journal_read(struct cache_set *, struct list_head *, 171int bch_journal_read(struct cache_set *, struct list_head *);
208 struct btree_op *); 172int bch_journal_replay(struct cache_set *, struct list_head *);
209int bch_journal_replay(struct cache_set *, struct list_head *,
210 struct btree_op *);
211 173
212void bch_journal_free(struct cache_set *); 174void bch_journal_free(struct cache_set *);
213int bch_journal_alloc(struct cache_set *); 175int bch_journal_alloc(struct cache_set *);
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 1a3b4f4786c3..7c1275e66025 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -12,8 +12,9 @@
12#include <trace/events/bcache.h> 12#include <trace/events/bcache.h>
13 13
14struct moving_io { 14struct moving_io {
15 struct closure cl;
15 struct keybuf_key *w; 16 struct keybuf_key *w;
16 struct search s; 17 struct data_insert_op op;
17 struct bbio bio; 18 struct bbio bio;
18}; 19};
19 20
@@ -38,13 +39,13 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k)
38 39
39static void moving_io_destructor(struct closure *cl) 40static void moving_io_destructor(struct closure *cl)
40{ 41{
41 struct moving_io *io = container_of(cl, struct moving_io, s.cl); 42 struct moving_io *io = container_of(cl, struct moving_io, cl);
42 kfree(io); 43 kfree(io);
43} 44}
44 45
45static void write_moving_finish(struct closure *cl) 46static void write_moving_finish(struct closure *cl)
46{ 47{
47 struct moving_io *io = container_of(cl, struct moving_io, s.cl); 48 struct moving_io *io = container_of(cl, struct moving_io, cl);
48 struct bio *bio = &io->bio.bio; 49 struct bio *bio = &io->bio.bio;
49 struct bio_vec *bv; 50 struct bio_vec *bv;
50 int i; 51 int i;
@@ -52,13 +53,12 @@ static void write_moving_finish(struct closure *cl)
52 bio_for_each_segment_all(bv, bio, i) 53 bio_for_each_segment_all(bv, bio, i)
53 __free_page(bv->bv_page); 54 __free_page(bv->bv_page);
54 55
55 if (io->s.op.insert_collision) 56 if (io->op.replace_collision)
56 trace_bcache_gc_copy_collision(&io->w->key); 57 trace_bcache_gc_copy_collision(&io->w->key);
57 58
58 bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); 59 bch_keybuf_del(&io->op.c->moving_gc_keys, io->w);
59 60
60 atomic_dec_bug(&io->s.op.c->in_flight); 61 up(&io->op.c->moving_in_flight);
61 closure_wake_up(&io->s.op.c->moving_gc_wait);
62 62
63 closure_return_with_destructor(cl, moving_io_destructor); 63 closure_return_with_destructor(cl, moving_io_destructor);
64} 64}
@@ -66,12 +66,12 @@ static void write_moving_finish(struct closure *cl)
66static void read_moving_endio(struct bio *bio, int error) 66static void read_moving_endio(struct bio *bio, int error)
67{ 67{
68 struct moving_io *io = container_of(bio->bi_private, 68 struct moving_io *io = container_of(bio->bi_private,
69 struct moving_io, s.cl); 69 struct moving_io, cl);
70 70
71 if (error) 71 if (error)
72 io->s.error = error; 72 io->op.error = error;
73 73
74 bch_bbio_endio(io->s.op.c, bio, error, "reading data to move"); 74 bch_bbio_endio(io->op.c, bio, error, "reading data to move");
75} 75}
76 76
77static void moving_init(struct moving_io *io) 77static void moving_init(struct moving_io *io)
@@ -85,54 +85,53 @@ static void moving_init(struct moving_io *io)
85 bio->bi_size = KEY_SIZE(&io->w->key) << 9; 85 bio->bi_size = KEY_SIZE(&io->w->key) << 9;
86 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), 86 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key),
87 PAGE_SECTORS); 87 PAGE_SECTORS);
88 bio->bi_private = &io->s.cl; 88 bio->bi_private = &io->cl;
89 bio->bi_io_vec = bio->bi_inline_vecs; 89 bio->bi_io_vec = bio->bi_inline_vecs;
90 bch_bio_map(bio, NULL); 90 bch_bio_map(bio, NULL);
91} 91}
92 92
93static void write_moving(struct closure *cl) 93static void write_moving(struct closure *cl)
94{ 94{
95 struct search *s = container_of(cl, struct search, cl); 95 struct moving_io *io = container_of(cl, struct moving_io, cl);
96 struct moving_io *io = container_of(s, struct moving_io, s); 96 struct data_insert_op *op = &io->op;
97 97
98 if (!s->error) { 98 if (!op->error) {
99 moving_init(io); 99 moving_init(io);
100 100
101 io->bio.bio.bi_sector = KEY_START(&io->w->key); 101 io->bio.bio.bi_sector = KEY_START(&io->w->key);
102 s->op.lock = -1; 102 op->write_prio = 1;
103 s->op.write_prio = 1; 103 op->bio = &io->bio.bio;
104 s->op.cache_bio = &io->bio.bio;
105 104
106 s->writeback = KEY_DIRTY(&io->w->key); 105 op->writeback = KEY_DIRTY(&io->w->key);
107 s->op.csum = KEY_CSUM(&io->w->key); 106 op->csum = KEY_CSUM(&io->w->key);
108 107
109 s->op.type = BTREE_REPLACE; 108 bkey_copy(&op->replace_key, &io->w->key);
110 bkey_copy(&s->op.replace, &io->w->key); 109 op->replace = true;
111 110
112 closure_init(&s->op.cl, cl); 111 closure_call(&op->cl, bch_data_insert, NULL, cl);
113 bch_insert_data(&s->op.cl);
114 } 112 }
115 113
116 continue_at(cl, write_moving_finish, NULL); 114 continue_at(cl, write_moving_finish, system_wq);
117} 115}
118 116
119static void read_moving_submit(struct closure *cl) 117static void read_moving_submit(struct closure *cl)
120{ 118{
121 struct search *s = container_of(cl, struct search, cl); 119 struct moving_io *io = container_of(cl, struct moving_io, cl);
122 struct moving_io *io = container_of(s, struct moving_io, s);
123 struct bio *bio = &io->bio.bio; 120 struct bio *bio = &io->bio.bio;
124 121
125 bch_submit_bbio(bio, s->op.c, &io->w->key, 0); 122 bch_submit_bbio(bio, io->op.c, &io->w->key, 0);
126 123
127 continue_at(cl, write_moving, bch_gc_wq); 124 continue_at(cl, write_moving, system_wq);
128} 125}
129 126
130static void read_moving(struct closure *cl) 127static void read_moving(struct cache_set *c)
131{ 128{
132 struct cache_set *c = container_of(cl, struct cache_set, moving_gc);
133 struct keybuf_key *w; 129 struct keybuf_key *w;
134 struct moving_io *io; 130 struct moving_io *io;
135 struct bio *bio; 131 struct bio *bio;
132 struct closure cl;
133
134 closure_init_stack(&cl);
136 135
137 /* XXX: if we error, background writeback could stall indefinitely */ 136 /* XXX: if we error, background writeback could stall indefinitely */
138 137
@@ -150,8 +149,8 @@ static void read_moving(struct closure *cl)
150 149
151 w->private = io; 150 w->private = io;
152 io->w = w; 151 io->w = w;
153 io->s.op.inode = KEY_INODE(&w->key); 152 io->op.inode = KEY_INODE(&w->key);
154 io->s.op.c = c; 153 io->op.c = c;
155 154
156 moving_init(io); 155 moving_init(io);
157 bio = &io->bio.bio; 156 bio = &io->bio.bio;
@@ -164,13 +163,8 @@ static void read_moving(struct closure *cl)
164 163
165 trace_bcache_gc_copy(&w->key); 164 trace_bcache_gc_copy(&w->key);
166 165
167 closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); 166 down(&c->moving_in_flight);
168 167 closure_call(&io->cl, read_moving_submit, NULL, &cl);
169 if (atomic_inc_return(&c->in_flight) >= 64) {
170 closure_wait_event(&c->moving_gc_wait, cl,
171 atomic_read(&c->in_flight) < 64);
172 continue_at(cl, read_moving, bch_gc_wq);
173 }
174 } 168 }
175 169
176 if (0) { 170 if (0) {
@@ -180,7 +174,7 @@ err: if (!IS_ERR_OR_NULL(w->private))
180 bch_keybuf_del(&c->moving_gc_keys, w); 174 bch_keybuf_del(&c->moving_gc_keys, w);
181 } 175 }
182 176
183 closure_return(cl); 177 closure_sync(&cl);
184} 178}
185 179
186static bool bucket_cmp(struct bucket *l, struct bucket *r) 180static bool bucket_cmp(struct bucket *l, struct bucket *r)
@@ -193,15 +187,14 @@ static unsigned bucket_heap_top(struct cache *ca)
193 return GC_SECTORS_USED(heap_peek(&ca->heap)); 187 return GC_SECTORS_USED(heap_peek(&ca->heap));
194} 188}
195 189
196void bch_moving_gc(struct closure *cl) 190void bch_moving_gc(struct cache_set *c)
197{ 191{
198 struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
199 struct cache *ca; 192 struct cache *ca;
200 struct bucket *b; 193 struct bucket *b;
201 unsigned i; 194 unsigned i;
202 195
203 if (!c->copy_gc_enabled) 196 if (!c->copy_gc_enabled)
204 closure_return(cl); 197 return;
205 198
206 mutex_lock(&c->bucket_lock); 199 mutex_lock(&c->bucket_lock);
207 200
@@ -242,13 +235,11 @@ void bch_moving_gc(struct closure *cl)
242 235
243 c->moving_gc_keys.last_scanned = ZERO_KEY; 236 c->moving_gc_keys.last_scanned = ZERO_KEY;
244 237
245 closure_init(&c->moving_gc, cl); 238 read_moving(c);
246 read_moving(&c->moving_gc);
247
248 closure_return(cl);
249} 239}
250 240
251void bch_moving_init_cache_set(struct cache_set *c) 241void bch_moving_init_cache_set(struct cache_set *c)
252{ 242{
253 bch_keybuf_init(&c->moving_gc_keys); 243 bch_keybuf_init(&c->moving_gc_keys);
244 sema_init(&c->moving_in_flight, 64);
254} 245}
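
In the movinggc.c hunks above, a counting semaphore initialised to 64 (moving_in_flight) replaces the hand-rolled atomic counter plus closure wait list for bounding in-flight copies: down() before issuing a copy, up() when its write finishes. A userspace sketch of the same throttle, with POSIX semaphores standing in for the kernel's sema_init()/down()/up() and the work itself stubbed out:

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

#define MAX_IN_FLIGHT 64

static sem_t moving_in_flight;

static void *do_move(void *arg)
{
	/* ... read and rewrite one extent ... */
	sem_post(&moving_in_flight);      /* up() when the write completes */
	return NULL;
}

int main(void)
{
	pthread_t t[128];

	sem_init(&moving_in_flight, 0, MAX_IN_FLIGHT);

	for (int i = 0; i < 128; i++) {
		sem_wait(&moving_in_flight);  /* down(): blocks past 64 in flight */
		pthread_create(&t[i], NULL, do_move, NULL);
	}
	for (int i = 0; i < 128; i++)
		pthread_join(t[i], NULL);

	printf("done\n");
	return 0;
}
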
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 2a7f0dd6abab..fbcc851ed5a5 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -25,7 +25,7 @@
25 25
26struct kmem_cache *bch_search_cache; 26struct kmem_cache *bch_search_cache;
27 27
28static void check_should_skip(struct cached_dev *, struct search *); 28static void bch_data_insert_start(struct closure *);
29 29
30/* Cgroup interface */ 30/* Cgroup interface */
31 31
@@ -213,221 +213,79 @@ static void bio_csum(struct bio *bio, struct bkey *k)
213 213
214/* Insert data into cache */ 214/* Insert data into cache */
215 215
216static void bio_invalidate(struct closure *cl) 216static void bch_data_insert_keys(struct closure *cl)
217{ 217{
218 struct btree_op *op = container_of(cl, struct btree_op, cl); 218 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
219 struct bio *bio = op->cache_bio; 219 atomic_t *journal_ref = NULL;
220 220 struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
221 pr_debug("invalidating %i sectors from %llu", 221 int ret;
222 bio_sectors(bio), (uint64_t) bio->bi_sector);
223
224 while (bio_sectors(bio)) {
225 unsigned len = min(bio_sectors(bio), 1U << 14);
226
227 if (bch_keylist_realloc(&op->keys, 0, op->c))
228 goto out;
229
230 bio->bi_sector += len;
231 bio->bi_size -= len << 9;
232
233 bch_keylist_add(&op->keys,
234 &KEY(op->inode, bio->bi_sector, len));
235 }
236
237 op->insert_data_done = true;
238 bio_put(bio);
239out:
240 continue_at(cl, bch_journal, bcache_wq);
241}
242
243struct open_bucket {
244 struct list_head list;
245 struct task_struct *last;
246 unsigned sectors_free;
247 BKEY_PADDED(key);
248};
249
250void bch_open_buckets_free(struct cache_set *c)
251{
252 struct open_bucket *b;
253
254 while (!list_empty(&c->data_buckets)) {
255 b = list_first_entry(&c->data_buckets,
256 struct open_bucket, list);
257 list_del(&b->list);
258 kfree(b);
259 }
260}
261
262int bch_open_buckets_alloc(struct cache_set *c)
263{
264 int i;
265
266 spin_lock_init(&c->data_bucket_lock);
267
268 for (i = 0; i < 6; i++) {
269 struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
270 if (!b)
271 return -ENOMEM;
272
273 list_add(&b->list, &c->data_buckets);
274 }
275
276 return 0;
277}
278
279/*
280 * We keep multiple buckets open for writes, and try to segregate different
281 * write streams for better cache utilization: first we look for a bucket where
282 * the last write to it was sequential with the current write, and failing that
283 * we look for a bucket that was last used by the same task.
284 *
285 * The idea is that if you've got multiple tasks pulling data into the cache at the
286 * same time, you'll get better cache utilization if you try to segregate their
287 * data and preserve locality.
288 *
289 * For example, say you're starting Firefox at the same time you're copying a
290 * bunch of files. Firefox will likely end up being fairly hot and stay in the
291 * cache awhile, but the data you copied might not be; if you wrote all that
292 * data to the same buckets it'd get invalidated at the same time.
293 *
294 * Both of those tasks will be doing fairly random IO so we can't rely on
295 * detecting sequential IO to segregate their data, but going off of the task
296 * should be a sane heuristic.
297 */
298static struct open_bucket *pick_data_bucket(struct cache_set *c,
299 const struct bkey *search,
300 struct task_struct *task,
301 struct bkey *alloc)
302{
303 struct open_bucket *ret, *ret_task = NULL;
304
305 list_for_each_entry_reverse(ret, &c->data_buckets, list)
306 if (!bkey_cmp(&ret->key, search))
307 goto found;
308 else if (ret->last == task)
309 ret_task = ret;
310
311 ret = ret_task ?: list_first_entry(&c->data_buckets,
312 struct open_bucket, list);
313found:
314 if (!ret->sectors_free && KEY_PTRS(alloc)) {
315 ret->sectors_free = c->sb.bucket_size;
316 bkey_copy(&ret->key, alloc);
317 bkey_init(alloc);
318 }
319
320 if (!ret->sectors_free)
321 ret = NULL;
322
323 return ret;
324}
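The comment above pick_data_bucket() explains the write-stream segregation the old open-bucket code did by remembering the task_struct that last wrote to each bucket. In the rewritten request path that role is played by a small integer write point; search_alloc() further down derives it as hash_long((unsigned long) current, 16). A userspace approximation of that derivation, using a golden-ratio multiplicative hash as a stand-in for the kernel helper (the "task" pointer here is obviously invented):

#include <stdint.h>
#include <stdio.h>

/* golden-ratio multiplicative hash, standing in for the kernel's hash_long() */
static inline uint64_t hash64(uint64_t val, unsigned bits)
{
	return (val * 0x61C8864680B583EBULL) >> (64 - bits);
}

int main(void)
{
	int on_stack;
	uintptr_t fake_task = (uintptr_t)&on_stack;	/* stand-in for current */
	unsigned write_point = (unsigned)hash64(fake_task, 16);

	printf("write point for this 'task': %u\n", write_point);
	return 0;
}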
325
326/*
327 * Allocates some space in the cache to write to, and k to point to the newly
328 * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the
329 * end of the newly allocated space).
330 *
331 * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many
332 * sectors were actually allocated.
333 *
334 * If s->writeback is true, will not fail.
335 */
336static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
337 struct search *s)
338{
339 struct cache_set *c = s->op.c;
340 struct open_bucket *b;
341 BKEY_PADDED(key) alloc;
342 struct closure cl, *w = NULL;
343 unsigned i;
344
345 if (s->writeback) {
346 closure_init_stack(&cl);
347 w = &cl;
348 }
349 222
350 /* 223 /*
351 * We might have to allocate a new bucket, which we can't do with a 224 * If we're looping, might already be waiting on
352 * spinlock held. So if we have to allocate, we drop the lock, allocate 225 * another journal write - can't wait on more than one journal write at
353 * and then retry. KEY_PTRS() indicates whether alloc points to 226 * a time
354 * allocated bucket(s). 227 *
228 * XXX: this looks wrong
355 */ 229 */
230#if 0
231 while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING)
232 closure_sync(&s->cl);
233#endif
356 234
357 bkey_init(&alloc.key); 235 if (!op->replace)
358 spin_lock(&c->data_bucket_lock); 236 journal_ref = bch_journal(op->c, &op->insert_keys,
359 237 op->flush_journal ? cl : NULL);
360 while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) {
361 unsigned watermark = s->op.write_prio
362 ? WATERMARK_MOVINGGC
363 : WATERMARK_NONE;
364
365 spin_unlock(&c->data_bucket_lock);
366
367 if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w))
368 return false;
369 238
370 spin_lock(&c->data_bucket_lock); 239 ret = bch_btree_insert(op->c, &op->insert_keys,
240 journal_ref, replace_key);
241 if (ret == -ESRCH) {
242 op->replace_collision = true;
243 } else if (ret) {
244 op->error = -ENOMEM;
245 op->insert_data_done = true;
371 } 246 }
372 247
373 /* 248 if (journal_ref)
374 * If we had to allocate, we might race and not need to allocate the 249 atomic_dec_bug(journal_ref);
375 * second time we call find_data_bucket(). If we allocated a bucket but
376 * didn't use it, drop the refcount bch_bucket_alloc_set() took:
377 */
378 if (KEY_PTRS(&alloc.key))
379 __bkey_put(c, &alloc.key);
380
381 for (i = 0; i < KEY_PTRS(&b->key); i++)
382 EBUG_ON(ptr_stale(c, &b->key, i));
383 250
384 /* Set up the pointer to the space we're allocating: */ 251 if (!op->insert_data_done)
252 continue_at(cl, bch_data_insert_start, bcache_wq);
385 253
386 for (i = 0; i < KEY_PTRS(&b->key); i++) 254 bch_keylist_free(&op->insert_keys);
387 k->ptr[i] = b->key.ptr[i]; 255 closure_return(cl);
256}
388 257
389 sectors = min(sectors, b->sectors_free); 258static void bch_data_invalidate(struct closure *cl)
259{
260 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
261 struct bio *bio = op->bio;
390 262
391 SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); 263 pr_debug("invalidating %i sectors from %llu",
392 SET_KEY_SIZE(k, sectors); 264 bio_sectors(bio), (uint64_t) bio->bi_sector);
393 SET_KEY_PTRS(k, KEY_PTRS(&b->key));
394 265
395 /* 266 while (bio_sectors(bio)) {
396 * Move b to the end of the lru, and keep track of what this bucket was 267 unsigned sectors = min(bio_sectors(bio),
397 * last used for: 268 1U << (KEY_SIZE_BITS - 1));
398 */
399 list_move_tail(&b->list, &c->data_buckets);
400 bkey_copy_key(&b->key, k);
401 b->last = s->task;
402 269
403 b->sectors_free -= sectors; 270 if (bch_keylist_realloc(&op->insert_keys, 0, op->c))
271 goto out;
404 272
405 for (i = 0; i < KEY_PTRS(&b->key); i++) { 273 bio->bi_sector += sectors;
406 SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); 274 bio->bi_size -= sectors << 9;
407 275
408 atomic_long_add(sectors, 276 bch_keylist_add(&op->insert_keys,
409 &PTR_CACHE(c, &b->key, i)->sectors_written); 277 &KEY(op->inode, bio->bi_sector, sectors));
410 } 278 }
411 279
412 if (b->sectors_free < c->sb.block_size) 280 op->insert_data_done = true;
413 b->sectors_free = 0; 281 bio_put(bio);
414 282out:
415 /* 283 continue_at(cl, bch_data_insert_keys, bcache_wq);
416 * k takes refcounts on the buckets it points to until it's inserted
417 * into the btree, but if we're done with this bucket we just transfer
418 * get_data_bucket()'s refcount.
419 */
420 if (b->sectors_free)
421 for (i = 0; i < KEY_PTRS(&b->key); i++)
422 atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);
423
424 spin_unlock(&c->data_bucket_lock);
425 return true;
426} 284}
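One detail of bch_data_invalidate() above that is easy to miss: each generated key can describe at most 1 << (KEY_SIZE_BITS - 1) sectors (the old bio_invalidate() used a flat 1 << 14), so a large bypassed region is chopped into several keys, advancing bi_sector first and recording the end sector in each KEY(). A standalone sketch of the same chunking, with the bcache key reduced to an (end offset, size) pair and KEY_SIZE_BITS assumed to be 16 purely for illustration:

#include <stdint.h>
#include <stdio.h>

#define KEY_SIZE_BITS 16	/* assumed for this sketch */

struct fake_key {
	uint64_t offset;	/* sector the extent ends at, bcache-style */
	unsigned size;		/* extent length in sectors */
};

static unsigned invalidate_range(uint64_t sector, unsigned nr_sectors,
				 struct fake_key *out, unsigned max_keys)
{
	unsigned n = 0;

	while (nr_sectors && n < max_keys) {
		unsigned len = nr_sectors < (1U << (KEY_SIZE_BITS - 1))
			     ? nr_sectors : 1U << (KEY_SIZE_BITS - 1);

		sector += len;			/* advance first ... */
		nr_sectors -= len;
		out[n++] = (struct fake_key) {	/* ... then key off the end */
			.offset = sector,
			.size	= len,
		};
	}

	return n;
}

int main(void)
{
	struct fake_key keys[8];
	unsigned n = invalidate_range(0, 100000, keys, 8);

	for (unsigned i = 0; i < n; i++)
		printf("invalidate key: end %llu, %u sectors\n",
		       (unsigned long long)keys[i].offset, keys[i].size);
	return 0;
}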
427 285
428static void bch_insert_data_error(struct closure *cl) 286static void bch_data_insert_error(struct closure *cl)
429{ 287{
430 struct btree_op *op = container_of(cl, struct btree_op, cl); 288 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
431 289
432 /* 290 /*
433 * Our data write just errored, which means we've got a bunch of keys to 291 * Our data write just errored, which means we've got a bunch of keys to
@@ -438,35 +296,34 @@ static void bch_insert_data_error(struct closure *cl)
438 * from the keys we'll accomplish just that. 296 * from the keys we'll accomplish just that.
439 */ 297 */
440 298
441 struct bkey *src = op->keys.bottom, *dst = op->keys.bottom; 299 struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys;
442 300
443 while (src != op->keys.top) { 301 while (src != op->insert_keys.top) {
444 struct bkey *n = bkey_next(src); 302 struct bkey *n = bkey_next(src);
445 303
446 SET_KEY_PTRS(src, 0); 304 SET_KEY_PTRS(src, 0);
447 bkey_copy(dst, src); 305 memmove(dst, src, bkey_bytes(src));
448 306
449 dst = bkey_next(dst); 307 dst = bkey_next(dst);
450 src = n; 308 src = n;
451 } 309 }
452 310
453 op->keys.top = dst; 311 op->insert_keys.top = dst;
454 312
455 bch_journal(cl); 313 bch_data_insert_keys(cl);
456} 314}
457 315
458static void bch_insert_data_endio(struct bio *bio, int error) 316static void bch_data_insert_endio(struct bio *bio, int error)
459{ 317{
460 struct closure *cl = bio->bi_private; 318 struct closure *cl = bio->bi_private;
461 struct btree_op *op = container_of(cl, struct btree_op, cl); 319 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
462 struct search *s = container_of(op, struct search, op);
463 320
464 if (error) { 321 if (error) {
465 /* TODO: We could try to recover from this. */ 322 /* TODO: We could try to recover from this. */
466 if (s->writeback) 323 if (op->writeback)
467 s->error = error; 324 op->error = error;
468 else if (s->write) 325 else if (!op->replace)
469 set_closure_fn(cl, bch_insert_data_error, bcache_wq); 326 set_closure_fn(cl, bch_data_insert_error, bcache_wq);
470 else 327 else
471 set_closure_fn(cl, NULL, NULL); 328 set_closure_fn(cl, NULL, NULL);
472 } 329 }
@@ -474,18 +331,17 @@ static void bch_insert_data_endio(struct bio *bio, int error)
474 bch_bbio_endio(op->c, bio, error, "writing data to cache"); 331 bch_bbio_endio(op->c, bio, error, "writing data to cache");
475} 332}
476 333
477static void bch_insert_data_loop(struct closure *cl) 334static void bch_data_insert_start(struct closure *cl)
478{ 335{
479 struct btree_op *op = container_of(cl, struct btree_op, cl); 336 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
480 struct search *s = container_of(op, struct search, op); 337 struct bio *bio = op->bio, *n;
481 struct bio *bio = op->cache_bio, *n;
482 338
483 if (op->skip) 339 if (op->bypass)
484 return bio_invalidate(cl); 340 return bch_data_invalidate(cl);
485 341
486 if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { 342 if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
487 set_gc_sectors(op->c); 343 set_gc_sectors(op->c);
488 bch_queue_gc(op->c); 344 wake_up_gc(op->c);
489 } 345 }
490 346
491 /* 347 /*
@@ -497,29 +353,30 @@ static void bch_insert_data_loop(struct closure *cl)
497 do { 353 do {
498 unsigned i; 354 unsigned i;
499 struct bkey *k; 355 struct bkey *k;
500 struct bio_set *split = s->d 356 struct bio_set *split = op->c->bio_split;
501 ? s->d->bio_split : op->c->bio_split;
502 357
503 /* 1 for the device pointer and 1 for the chksum */ 358 /* 1 for the device pointer and 1 for the chksum */
504 if (bch_keylist_realloc(&op->keys, 359 if (bch_keylist_realloc(&op->insert_keys,
505 1 + (op->csum ? 1 : 0), 360 1 + (op->csum ? 1 : 0),
506 op->c)) 361 op->c))
507 continue_at(cl, bch_journal, bcache_wq); 362 continue_at(cl, bch_data_insert_keys, bcache_wq);
508 363
509 k = op->keys.top; 364 k = op->insert_keys.top;
510 bkey_init(k); 365 bkey_init(k);
511 SET_KEY_INODE(k, op->inode); 366 SET_KEY_INODE(k, op->inode);
512 SET_KEY_OFFSET(k, bio->bi_sector); 367 SET_KEY_OFFSET(k, bio->bi_sector);
513 368
514 if (!bch_alloc_sectors(k, bio_sectors(bio), s)) 369 if (!bch_alloc_sectors(op->c, k, bio_sectors(bio),
370 op->write_point, op->write_prio,
371 op->writeback))
515 goto err; 372 goto err;
516 373
517 n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); 374 n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
518 375
519 n->bi_end_io = bch_insert_data_endio; 376 n->bi_end_io = bch_data_insert_endio;
520 n->bi_private = cl; 377 n->bi_private = cl;
521 378
522 if (s->writeback) { 379 if (op->writeback) {
523 SET_KEY_DIRTY(k, true); 380 SET_KEY_DIRTY(k, true);
524 381
525 for (i = 0; i < KEY_PTRS(k); i++) 382 for (i = 0; i < KEY_PTRS(k); i++)
@@ -532,17 +389,17 @@ static void bch_insert_data_loop(struct closure *cl)
532 bio_csum(n, k); 389 bio_csum(n, k);
533 390
534 trace_bcache_cache_insert(k); 391 trace_bcache_cache_insert(k);
535 bch_keylist_push(&op->keys); 392 bch_keylist_push(&op->insert_keys);
536 393
537 n->bi_rw |= REQ_WRITE; 394 n->bi_rw |= REQ_WRITE;
538 bch_submit_bbio(n, op->c, k, 0); 395 bch_submit_bbio(n, op->c, k, 0);
539 } while (n != bio); 396 } while (n != bio);
540 397
541 op->insert_data_done = true; 398 op->insert_data_done = true;
542 continue_at(cl, bch_journal, bcache_wq); 399 continue_at(cl, bch_data_insert_keys, bcache_wq);
543err: 400err:
544 /* bch_alloc_sectors() blocks if s->writeback = true */ 401 /* bch_alloc_sectors() blocks if s->writeback = true */
545 BUG_ON(s->writeback); 402 BUG_ON(op->writeback);
546 403
547 /* 404 /*
548 * But if it's not a writeback write we'd rather just bail out if 405 * But if it's not a writeback write we'd rather just bail out if
@@ -550,15 +407,15 @@ err:
550 * we might be starving btree writes for gc or something. 407 * we might be starving btree writes for gc or something.
551 */ 408 */
552 409
553 if (s->write) { 410 if (!op->replace) {
554 /* 411 /*
555 * Writethrough write: We can't complete the write until we've 412 * Writethrough write: We can't complete the write until we've
556 * updated the index. But we don't want to delay the write while 413 * updated the index. But we don't want to delay the write while
557 * we wait for buckets to be freed up, so just invalidate the 414 * we wait for buckets to be freed up, so just invalidate the
558 * rest of the write. 415 * rest of the write.
559 */ 416 */
560 op->skip = true; 417 op->bypass = true;
561 return bio_invalidate(cl); 418 return bch_data_invalidate(cl);
562 } else { 419 } else {
563 /* 420 /*
564 * From a cache miss, we can just insert the keys for the data 421 * From a cache miss, we can just insert the keys for the data
@@ -567,15 +424,15 @@ err:
567 op->insert_data_done = true; 424 op->insert_data_done = true;
568 bio_put(bio); 425 bio_put(bio);
569 426
570 if (!bch_keylist_empty(&op->keys)) 427 if (!bch_keylist_empty(&op->insert_keys))
571 continue_at(cl, bch_journal, bcache_wq); 428 continue_at(cl, bch_data_insert_keys, bcache_wq);
572 else 429 else
573 closure_return(cl); 430 closure_return(cl);
574 } 431 }
575} 432}
576 433
577/** 434/**
578 * bch_insert_data - stick some data in the cache 435 * bch_data_insert - stick some data in the cache
579 * 436 *
580 * This is the starting point for any data to end up in a cache device; it could 437 * This is the starting point for any data to end up in a cache device; it could
581 * be from a normal write, or a writeback write, or a write to a flash only 438 * be from a normal write, or a writeback write, or a write to a flash only
@@ -587,56 +444,179 @@ err:
587 * data is written it calls bch_journal, and after the keys have been added to 444 * data is written it calls bch_journal, and after the keys have been added to
588 * the next journal write they're inserted into the btree. 445 * the next journal write they're inserted into the btree.
589 * 446 *
590 * It inserts the data in op->cache_bio; bi_sector is used for the key offset, 447 * It inserts the data in s->cache_bio; bi_sector is used for the key offset,
591 * and op->inode is used for the key inode. 448 * and op->inode is used for the key inode.
592 * 449 *
593 * If op->skip is true, instead of inserting the data it invalidates the region 450 * If s->bypass is true, instead of inserting the data it invalidates the
594 * of the cache represented by op->cache_bio and op->inode. 451 * region of the cache represented by s->cache_bio and op->inode.
595 */ 452 */
596void bch_insert_data(struct closure *cl) 453void bch_data_insert(struct closure *cl)
597{ 454{
598 struct btree_op *op = container_of(cl, struct btree_op, cl); 455 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
456
457 trace_bcache_write(op->bio, op->writeback, op->bypass);
599 458
600 bch_keylist_init(&op->keys); 459 bch_keylist_init(&op->insert_keys);
601 bio_get(op->cache_bio); 460 bio_get(op->bio);
602 bch_insert_data_loop(cl); 461 bch_data_insert_start(cl);
603} 462}
604 463
605void bch_btree_insert_async(struct closure *cl) 464/* Congested? */
465
466unsigned bch_get_congested(struct cache_set *c)
606{ 467{
607 struct btree_op *op = container_of(cl, struct btree_op, cl); 468 int i;
608 struct search *s = container_of(op, struct search, op); 469 long rand;
609 470
610 if (bch_btree_insert(op, op->c)) { 471 if (!c->congested_read_threshold_us &&
611 s->error = -ENOMEM; 472 !c->congested_write_threshold_us)
612 op->insert_data_done = true; 473 return 0;
613 } 474
475 i = (local_clock_us() - c->congested_last_us) / 1024;
476 if (i < 0)
477 return 0;
478
479 i += atomic_read(&c->congested);
480 if (i >= 0)
481 return 0;
614 482
615 if (op->insert_data_done) { 483 i += CONGESTED_MAX;
616 bch_keylist_free(&op->keys); 484
617 closure_return(cl); 485 if (i > 0)
618 } else 486 i = fract_exp_two(i, 6);
619 continue_at(cl, bch_insert_data_loop, bcache_wq); 487
488 rand = get_random_int();
489 i -= bitmap_weight(&rand, BITS_PER_LONG);
490
491 return i > 0 ? i : 1;
620} 492}
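bch_get_congested() above (moved here verbatim from further down in the file) turns the time since the last congestion event plus the outstanding congestion counter into a cutoff, runs it through fract_exp_two(), and then dithers the result by subtracting the popcount of a random long so the cutoff isn't a hard edge. The dithering step in isolation, as plain C with a GCC/Clang builtin; fract_exp_two() lives in util.c and isn't shown in this diff, so it is omitted here:

#include <stdio.h>
#include <stdlib.h>

static unsigned dither_cutoff(int i)
{
	long r = rand();

	/* knock off the popcount of a random word so repeated calls
	 * don't all cut off at exactly the same value */
	i -= __builtin_popcountl(r);

	return i > 0 ? (unsigned)i : 1;
}

int main(void)
{
	srand(42);

	for (int k = 0; k < 4; k++)
		printf("congested cutoff: %u\n", dither_cutoff(100));

	return 0;
}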
621 493
622/* Common code for the make_request functions */ 494static void add_sequential(struct task_struct *t)
495{
496 ewma_add(t->sequential_io_avg,
497 t->sequential_io, 8, 0);
623 498
624static void request_endio(struct bio *bio, int error) 499 t->sequential_io = 0;
500}
501
502static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
625{ 503{
626 struct closure *cl = bio->bi_private; 504 return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
505}
627 506
628 if (error) { 507static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
629 struct search *s = container_of(cl, struct search, cl); 508{
630 s->error = error; 509 struct cache_set *c = dc->disk.c;
631 /* Only cache read errors are recoverable */ 510 unsigned mode = cache_mode(dc, bio);
632 s->recoverable = false; 511 unsigned sectors, congested = bch_get_congested(c);
512 struct task_struct *task = current;
513 struct io *i;
514
515 if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
516 c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
517 (bio->bi_rw & REQ_DISCARD))
518 goto skip;
519
520 if (mode == CACHE_MODE_NONE ||
521 (mode == CACHE_MODE_WRITEAROUND &&
522 (bio->bi_rw & REQ_WRITE)))
523 goto skip;
524
525 if (bio->bi_sector & (c->sb.block_size - 1) ||
526 bio_sectors(bio) & (c->sb.block_size - 1)) {
527 pr_debug("skipping unaligned io");
528 goto skip;
633 } 529 }
634 530
635 bio_put(bio); 531 if (bypass_torture_test(dc)) {
636 closure_put(cl); 532 if ((get_random_int() & 3) == 3)
533 goto skip;
534 else
535 goto rescale;
536 }
537
538 if (!congested && !dc->sequential_cutoff)
539 goto rescale;
540
541 if (!congested &&
542 mode == CACHE_MODE_WRITEBACK &&
543 (bio->bi_rw & REQ_WRITE) &&
544 (bio->bi_rw & REQ_SYNC))
545 goto rescale;
546
547 spin_lock(&dc->io_lock);
548
549 hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash)
550 if (i->last == bio->bi_sector &&
551 time_before(jiffies, i->jiffies))
552 goto found;
553
554 i = list_first_entry(&dc->io_lru, struct io, lru);
555
556 add_sequential(task);
557 i->sequential = 0;
558found:
559 if (i->sequential + bio->bi_size > i->sequential)
560 i->sequential += bio->bi_size;
561
562 i->last = bio_end_sector(bio);
563 i->jiffies = jiffies + msecs_to_jiffies(5000);
564 task->sequential_io = i->sequential;
565
566 hlist_del(&i->hash);
567 hlist_add_head(&i->hash, iohash(dc, i->last));
568 list_move_tail(&i->lru, &dc->io_lru);
569
570 spin_unlock(&dc->io_lock);
571
572 sectors = max(task->sequential_io,
573 task->sequential_io_avg) >> 9;
574
575 if (dc->sequential_cutoff &&
576 sectors >= dc->sequential_cutoff >> 9) {
577 trace_bcache_bypass_sequential(bio);
578 goto skip;
579 }
580
581 if (congested && sectors >= congested) {
582 trace_bcache_bypass_congested(bio);
583 goto skip;
584 }
585
586rescale:
587 bch_rescale_priorities(c, bio_sectors(bio));
588 return false;
589skip:
590 bch_mark_sectors_bypassed(c, dc, bio_sectors(bio));
591 return true;
637} 592}
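check_should_bypass() is the old check_should_skip() reshaped to take a bare bio and return a bool (note the sequential_merge toggle is gone: the hashed table of recent I/O tails is now always consulted). Its heart is that table keyed by where recent streams ended; if a new bio starts exactly there, the stream's byte count keeps growing until it trips the sequential cutoff. A toy version of that detector, with a flat array instead of the hash+LRU, the 5-second expiry omitted, and invented names throughout:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define RECENT_IO 8	/* bcache hashes into a larger table; 8 is arbitrary */

struct io_tail {
	uint64_t last;		/* sector the stream ended at */
	uint64_t sequential;	/* bytes accumulated by that stream */
};

static struct io_tail recent[RECENT_IO];

static uint64_t note_io(uint64_t sector, unsigned bytes)
{
	struct io_tail *t = &recent[0];		/* victim slot if no match */

	for (unsigned i = 0; i < RECENT_IO; i++)
		if (recent[i].last == sector) {
			t = &recent[i];
			goto found;
		}

	t->sequential = 0;			/* start a new stream */
found:
	t->sequential += bytes;
	t->last = sector + bytes / 512;
	return t->sequential;
}

int main(void)
{
	memset(recent, 0, sizeof(recent));

	/* three back-to-back 4 KiB reads starting at sector 1000 */
	for (int i = 0; i < 3; i++)
		printf("sequential bytes so far: %llu\n",
		       (unsigned long long)note_io(1000 + i * 8, 4096));
	return 0;
}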
638 593
639void bch_cache_read_endio(struct bio *bio, int error) 594/* Cache lookup */
595
596struct search {
597 /* Stack frame for bio_complete */
598 struct closure cl;
599
600 struct bcache_device *d;
601
602 struct bbio bio;
603 struct bio *orig_bio;
604 struct bio *cache_miss;
605
606 unsigned insert_bio_sectors;
607
608 unsigned recoverable:1;
609 unsigned unaligned_bvec:1;
610 unsigned write:1;
611 unsigned read_dirty_data:1;
612
613 unsigned long start_time;
614
615 struct btree_op op;
616 struct data_insert_op iop;
617};
618
619static void bch_cache_read_endio(struct bio *bio, int error)
640{ 620{
641 struct bbio *b = container_of(bio, struct bbio, bio); 621 struct bbio *b = container_of(bio, struct bbio, bio);
642 struct closure *cl = bio->bi_private; 622 struct closure *cl = bio->bi_private;
@@ -650,13 +630,113 @@ void bch_cache_read_endio(struct bio *bio, int error)
650 */ 630 */
651 631
652 if (error) 632 if (error)
653 s->error = error; 633 s->iop.error = error;
654 else if (ptr_stale(s->op.c, &b->key, 0)) { 634 else if (ptr_stale(s->iop.c, &b->key, 0)) {
655 atomic_long_inc(&s->op.c->cache_read_races); 635 atomic_long_inc(&s->iop.c->cache_read_races);
656 s->error = -EINTR; 636 s->iop.error = -EINTR;
657 } 637 }
658 638
659 bch_bbio_endio(s->op.c, bio, error, "reading from cache"); 639 bch_bbio_endio(s->iop.c, bio, error, "reading from cache");
640}
641
642/*
643 * Read from a single key, handling the initial cache miss if the key starts in
644 * the middle of the bio
645 */
646static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
647{
648 struct search *s = container_of(op, struct search, op);
649 struct bio *n, *bio = &s->bio.bio;
650 struct bkey *bio_key;
651 unsigned ptr;
652
653 if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_sector, 0)) <= 0)
654 return MAP_CONTINUE;
655
656 if (KEY_INODE(k) != s->iop.inode ||
657 KEY_START(k) > bio->bi_sector) {
658 unsigned bio_sectors = bio_sectors(bio);
659 unsigned sectors = KEY_INODE(k) == s->iop.inode
660 ? min_t(uint64_t, INT_MAX,
661 KEY_START(k) - bio->bi_sector)
662 : INT_MAX;
663
664 int ret = s->d->cache_miss(b, s, bio, sectors);
665 if (ret != MAP_CONTINUE)
666 return ret;
667
668 /* if this was a complete miss we shouldn't get here */
669 BUG_ON(bio_sectors <= sectors);
670 }
671
672 if (!KEY_SIZE(k))
673 return MAP_CONTINUE;
674
675 /* XXX: figure out best pointer - for multiple cache devices */
676 ptr = 0;
677
678 PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
679
680 if (KEY_DIRTY(k))
681 s->read_dirty_data = true;
682
683 n = bch_bio_split(bio, min_t(uint64_t, INT_MAX,
684 KEY_OFFSET(k) - bio->bi_sector),
685 GFP_NOIO, s->d->bio_split);
686
687 bio_key = &container_of(n, struct bbio, bio)->key;
688 bch_bkey_copy_single_ptr(bio_key, k, ptr);
689
690 bch_cut_front(&KEY(s->iop.inode, n->bi_sector, 0), bio_key);
691 bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key);
692
693 n->bi_end_io = bch_cache_read_endio;
694 n->bi_private = &s->cl;
695
696 /*
697 * The bucket we're reading from might be reused while our bio
698 * is in flight, and we could then end up reading the wrong
699 * data.
700 *
701 * We guard against this by checking (in cache_read_endio()) if
702 * the pointer is stale again; if so, we treat it as an error
703 * and reread from the backing device (but we don't pass that
704 * error up anywhere).
705 */
706
707 __bch_submit_bbio(n, b->c);
708 return n == bio ? MAP_DONE : MAP_CONTINUE;
709}
710
711static void cache_lookup(struct closure *cl)
712{
713 struct search *s = container_of(cl, struct search, iop.cl);
714 struct bio *bio = &s->bio.bio;
715
716 int ret = bch_btree_map_keys(&s->op, s->iop.c,
717 &KEY(s->iop.inode, bio->bi_sector, 0),
718 cache_lookup_fn, MAP_END_KEY);
719 if (ret == -EAGAIN)
720 continue_at(cl, cache_lookup, bcache_wq);
721
722 closure_return(cl);
723}
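The new lookup path replaces btree_read_async()/btree_search_recurse() with bch_btree_map_keys() plus the cache_lookup_fn() callback above: the callback reads (or cache-misses) one extent at a time and steers the walk by returning MAP_CONTINUE or MAP_DONE, while cache_lookup() handles -EAGAIN by rescheduling itself on bcache_wq. The control-flow contract, reduced to a toy in-memory key array (no btree, no closures; everything here is invented for illustration):

#include <stdio.h>

enum { MAP_DONE = 0, MAP_CONTINUE = 1 };

typedef int (*map_fn)(void *ctx, unsigned key);

static int map_keys(const unsigned *keys, unsigned nr, unsigned start,
		    map_fn fn, void *ctx)
{
	for (unsigned i = 0; i < nr; i++) {
		if (keys[i] < start)	/* walk starts at the search key */
			continue;

		int ret = fn(ctx, keys[i]);
		if (ret != MAP_CONTINUE)
			return ret;	/* callback consumed the request */
	}
	return MAP_DONE;
}

struct lookup { unsigned extents_wanted; };

static int lookup_fn(void *ctx, unsigned key)
{
	struct lookup *l = ctx;

	printf("read extent ending at key %u\n", key);
	return --l->extents_wanted ? MAP_CONTINUE : MAP_DONE;
}

int main(void)
{
	unsigned keys[] = { 10, 20, 30, 40 };
	struct lookup l = { .extents_wanted = 2 };

	map_keys(keys, 4, 15, lookup_fn, &l);
	return 0;
}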
724
725/* Common code for the make_request functions */
726
727static void request_endio(struct bio *bio, int error)
728{
729 struct closure *cl = bio->bi_private;
730
731 if (error) {
732 struct search *s = container_of(cl, struct search, cl);
733 s->iop.error = error;
734 /* Only cache read errors are recoverable */
735 s->recoverable = false;
736 }
737
738 bio_put(bio);
739 closure_put(cl);
660} 740}
661 741
662static void bio_complete(struct search *s) 742static void bio_complete(struct search *s)
@@ -670,8 +750,8 @@ static void bio_complete(struct search *s)
670 part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration); 750 part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration);
671 part_stat_unlock(); 751 part_stat_unlock();
672 752
673 trace_bcache_request_end(s, s->orig_bio); 753 trace_bcache_request_end(s->d, s->orig_bio);
674 bio_endio(s->orig_bio, s->error); 754 bio_endio(s->orig_bio, s->iop.error);
675 s->orig_bio = NULL; 755 s->orig_bio = NULL;
676 } 756 }
677} 757}
@@ -691,8 +771,8 @@ static void search_free(struct closure *cl)
691 struct search *s = container_of(cl, struct search, cl); 771 struct search *s = container_of(cl, struct search, cl);
692 bio_complete(s); 772 bio_complete(s);
693 773
694 if (s->op.cache_bio) 774 if (s->iop.bio)
695 bio_put(s->op.cache_bio); 775 bio_put(s->iop.bio);
696 776
697 if (s->unaligned_bvec) 777 if (s->unaligned_bvec)
698 mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); 778 mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec);
@@ -703,21 +783,22 @@ static void search_free(struct closure *cl)
703 783
704static struct search *search_alloc(struct bio *bio, struct bcache_device *d) 784static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
705{ 785{
786 struct search *s;
706 struct bio_vec *bv; 787 struct bio_vec *bv;
707 struct search *s = mempool_alloc(d->c->search, GFP_NOIO); 788
708 memset(s, 0, offsetof(struct search, op.keys)); 789 s = mempool_alloc(d->c->search, GFP_NOIO);
790 memset(s, 0, offsetof(struct search, iop.insert_keys));
709 791
710 __closure_init(&s->cl, NULL); 792 __closure_init(&s->cl, NULL);
711 793
712 s->op.inode = d->id; 794 s->iop.inode = d->id;
713 s->op.c = d->c; 795 s->iop.c = d->c;
714 s->d = d; 796 s->d = d;
715 s->op.lock = -1; 797 s->op.lock = -1;
716 s->task = current; 798 s->iop.write_point = hash_long((unsigned long) current, 16);
717 s->orig_bio = bio; 799 s->orig_bio = bio;
718 s->write = (bio->bi_rw & REQ_WRITE) != 0; 800 s->write = (bio->bi_rw & REQ_WRITE) != 0;
719 s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; 801 s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
720 s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0;
721 s->recoverable = 1; 802 s->recoverable = 1;
722 s->start_time = jiffies; 803 s->start_time = jiffies;
723 do_bio_hook(s); 804 do_bio_hook(s);
@@ -734,18 +815,6 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
734 return s; 815 return s;
735} 816}
736 817
737static void btree_read_async(struct closure *cl)
738{
739 struct btree_op *op = container_of(cl, struct btree_op, cl);
740
741 int ret = btree_root(search_recurse, op->c, op);
742
743 if (ret == -EAGAIN)
744 continue_at(cl, btree_read_async, bcache_wq);
745
746 closure_return(cl);
747}
748
749/* Cached devices */ 818/* Cached devices */
750 819
751static void cached_dev_bio_complete(struct closure *cl) 820static void cached_dev_bio_complete(struct closure *cl)
@@ -759,27 +828,28 @@ static void cached_dev_bio_complete(struct closure *cl)
759 828
760/* Process reads */ 829/* Process reads */
761 830
762static void cached_dev_read_complete(struct closure *cl) 831static void cached_dev_cache_miss_done(struct closure *cl)
763{ 832{
764 struct search *s = container_of(cl, struct search, cl); 833 struct search *s = container_of(cl, struct search, cl);
765 834
766 if (s->op.insert_collision) 835 if (s->iop.replace_collision)
767 bch_mark_cache_miss_collision(s); 836 bch_mark_cache_miss_collision(s->iop.c, s->d);
768 837
769 if (s->op.cache_bio) { 838 if (s->iop.bio) {
770 int i; 839 int i;
771 struct bio_vec *bv; 840 struct bio_vec *bv;
772 841
773 __bio_for_each_segment(bv, s->op.cache_bio, i, 0) 842 bio_for_each_segment_all(bv, s->iop.bio, i)
774 __free_page(bv->bv_page); 843 __free_page(bv->bv_page);
775 } 844 }
776 845
777 cached_dev_bio_complete(cl); 846 cached_dev_bio_complete(cl);
778} 847}
779 848
780static void request_read_error(struct closure *cl) 849static void cached_dev_read_error(struct closure *cl)
781{ 850{
782 struct search *s = container_of(cl, struct search, cl); 851 struct search *s = container_of(cl, struct search, cl);
852 struct bio *bio = &s->bio.bio;
783 struct bio_vec *bv; 853 struct bio_vec *bv;
784 int i; 854 int i;
785 855
@@ -787,7 +857,7 @@ static void request_read_error(struct closure *cl)
787 /* Retry from the backing device: */ 857 /* Retry from the backing device: */
788 trace_bcache_read_retry(s->orig_bio); 858 trace_bcache_read_retry(s->orig_bio);
789 859
790 s->error = 0; 860 s->iop.error = 0;
791 bv = s->bio.bio.bi_io_vec; 861 bv = s->bio.bio.bi_io_vec;
792 do_bio_hook(s); 862 do_bio_hook(s);
793 s->bio.bio.bi_io_vec = bv; 863 s->bio.bio.bi_io_vec = bv;
@@ -803,146 +873,148 @@ static void request_read_error(struct closure *cl)
803 873
804 /* XXX: invalidate cache */ 874 /* XXX: invalidate cache */
805 875
806 closure_bio_submit(&s->bio.bio, &s->cl, s->d); 876 closure_bio_submit(bio, cl, s->d);
807 } 877 }
808 878
809 continue_at(cl, cached_dev_read_complete, NULL); 879 continue_at(cl, cached_dev_cache_miss_done, NULL);
810} 880}
811 881
812static void request_read_done(struct closure *cl) 882static void cached_dev_read_done(struct closure *cl)
813{ 883{
814 struct search *s = container_of(cl, struct search, cl); 884 struct search *s = container_of(cl, struct search, cl);
815 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 885 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
816 886
817 /* 887 /*
818 * s->cache_bio != NULL implies that we had a cache miss; cache_bio now 888 * We had a cache miss; cache_bio now contains data ready to be inserted
819 * contains data ready to be inserted into the cache. 889 * into the cache.
820 * 890 *
821 * First, we copy the data we just read from cache_bio's bounce buffers 891 * First, we copy the data we just read from cache_bio's bounce buffers
822 * to the buffers the original bio pointed to: 892 * to the buffers the original bio pointed to:
823 */ 893 */
824 894
825 if (s->op.cache_bio) { 895 if (s->iop.bio) {
826 bio_reset(s->op.cache_bio); 896 bio_reset(s->iop.bio);
827 s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; 897 s->iop.bio->bi_sector = s->cache_miss->bi_sector;
828 s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; 898 s->iop.bio->bi_bdev = s->cache_miss->bi_bdev;
829 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; 899 s->iop.bio->bi_size = s->insert_bio_sectors << 9;
830 bch_bio_map(s->op.cache_bio, NULL); 900 bch_bio_map(s->iop.bio, NULL);
831 901
832 bio_copy_data(s->cache_miss, s->op.cache_bio); 902 bio_copy_data(s->cache_miss, s->iop.bio);
833 903
834 bio_put(s->cache_miss); 904 bio_put(s->cache_miss);
835 s->cache_miss = NULL; 905 s->cache_miss = NULL;
836 } 906 }
837 907
838 if (verify(dc, &s->bio.bio) && s->recoverable) 908 if (verify(dc, &s->bio.bio) && s->recoverable &&
839 bch_data_verify(s); 909 !s->unaligned_bvec && !s->read_dirty_data)
910 bch_data_verify(dc, s->orig_bio);
840 911
841 bio_complete(s); 912 bio_complete(s);
842 913
843 if (s->op.cache_bio && 914 if (s->iop.bio &&
844 !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) { 915 !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) {
845 s->op.type = BTREE_REPLACE; 916 BUG_ON(!s->iop.replace);
846 closure_call(&s->op.cl, bch_insert_data, NULL, cl); 917 closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
847 } 918 }
848 919
849 continue_at(cl, cached_dev_read_complete, NULL); 920 continue_at(cl, cached_dev_cache_miss_done, NULL);
850} 921}
851 922
852static void request_read_done_bh(struct closure *cl) 923static void cached_dev_read_done_bh(struct closure *cl)
853{ 924{
854 struct search *s = container_of(cl, struct search, cl); 925 struct search *s = container_of(cl, struct search, cl);
855 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 926 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
856 927
857 bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); 928 bch_mark_cache_accounting(s->iop.c, s->d,
858 trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip); 929 !s->cache_miss, s->iop.bypass);
930 trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass);
859 931
860 if (s->error) 932 if (s->iop.error)
861 continue_at_nobarrier(cl, request_read_error, bcache_wq); 933 continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
862 else if (s->op.cache_bio || verify(dc, &s->bio.bio)) 934 else if (s->iop.bio || verify(dc, &s->bio.bio))
863 continue_at_nobarrier(cl, request_read_done, bcache_wq); 935 continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq);
864 else 936 else
865 continue_at_nobarrier(cl, cached_dev_read_complete, NULL); 937 continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
866} 938}
867 939
868static int cached_dev_cache_miss(struct btree *b, struct search *s, 940static int cached_dev_cache_miss(struct btree *b, struct search *s,
869 struct bio *bio, unsigned sectors) 941 struct bio *bio, unsigned sectors)
870{ 942{
871 int ret = 0; 943 int ret = MAP_CONTINUE;
872 unsigned reada; 944 unsigned reada = 0;
873 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 945 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
874 struct bio *miss; 946 struct bio *miss, *cache_bio;
875
876 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
877 if (miss == bio)
878 s->op.lookup_done = true;
879 947
880 miss->bi_end_io = request_endio; 948 if (s->cache_miss || s->iop.bypass) {
881 miss->bi_private = &s->cl; 949 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
882 950 ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
883 if (s->cache_miss || s->op.skip)
884 goto out_submit; 951 goto out_submit;
885
886 if (miss != bio ||
887 (bio->bi_rw & REQ_RAHEAD) ||
888 (bio->bi_rw & REQ_META) ||
889 s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA)
890 reada = 0;
891 else {
892 reada = min(dc->readahead >> 9,
893 sectors - bio_sectors(miss));
894
895 if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev))
896 reada = bdev_sectors(miss->bi_bdev) -
897 bio_end_sector(miss);
898 } 952 }
899 953
900 s->cache_bio_sectors = bio_sectors(miss) + reada; 954 if (!(bio->bi_rw & REQ_RAHEAD) &&
901 s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT, 955 !(bio->bi_rw & REQ_META) &&
902 DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), 956 s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA)
903 dc->disk.bio_split); 957 reada = min_t(sector_t, dc->readahead >> 9,
958 bdev_sectors(bio->bi_bdev) - bio_end_sector(bio));
904 959
905 if (!s->op.cache_bio) 960 s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada);
906 goto out_submit;
907 961
908 s->op.cache_bio->bi_sector = miss->bi_sector; 962 s->iop.replace_key = KEY(s->iop.inode,
909 s->op.cache_bio->bi_bdev = miss->bi_bdev; 963 bio->bi_sector + s->insert_bio_sectors,
910 s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; 964 s->insert_bio_sectors);
911 965
912 s->op.cache_bio->bi_end_io = request_endio; 966 ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key);
913 s->op.cache_bio->bi_private = &s->cl; 967 if (ret)
968 return ret;
969
970 s->iop.replace = true;
971
972 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
914 973
915 /* btree_search_recurse()'s btree iterator is no good anymore */ 974 /* btree_search_recurse()'s btree iterator is no good anymore */
916 ret = -EINTR; 975 ret = miss == bio ? MAP_DONE : -EINTR;
917 if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio)) 976
918 goto out_put; 977 cache_bio = bio_alloc_bioset(GFP_NOWAIT,
978 DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS),
979 dc->disk.bio_split);
980 if (!cache_bio)
981 goto out_submit;
982
983 cache_bio->bi_sector = miss->bi_sector;
984 cache_bio->bi_bdev = miss->bi_bdev;
985 cache_bio->bi_size = s->insert_bio_sectors << 9;
986
987 cache_bio->bi_end_io = request_endio;
988 cache_bio->bi_private = &s->cl;
919 989
920 bch_bio_map(s->op.cache_bio, NULL); 990 bch_bio_map(cache_bio, NULL);
921 if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) 991 if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
922 goto out_put; 992 goto out_put;
923 993
924 s->cache_miss = miss; 994 if (reada)
925 bio_get(s->op.cache_bio); 995 bch_mark_cache_readahead(s->iop.c, s->d);
926 996
927 closure_bio_submit(s->op.cache_bio, &s->cl, s->d); 997 s->cache_miss = miss;
998 s->iop.bio = cache_bio;
999 bio_get(cache_bio);
1000 closure_bio_submit(cache_bio, &s->cl, s->d);
928 1001
929 return ret; 1002 return ret;
930out_put: 1003out_put:
931 bio_put(s->op.cache_bio); 1004 bio_put(cache_bio);
932 s->op.cache_bio = NULL;
933out_submit: 1005out_submit:
1006 miss->bi_end_io = request_endio;
1007 miss->bi_private = &s->cl;
934 closure_bio_submit(miss, &s->cl, s->d); 1008 closure_bio_submit(miss, &s->cl, s->d);
935 return ret; 1009 return ret;
936} 1010}
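In the rewritten cached_dev_cache_miss() above, readahead is computed before the split: the configured dc->readahead (converted to sectors) is clamped so it never runs past the end of the backing device, and the cache fill size is then the hole size capped at the bio plus that readahead. The arithmetic on its own, with made-up numbers (all units are 512-byte sectors):

#include <stdio.h>

static unsigned long long min_ull(unsigned long long a, unsigned long long b)
{
	return a < b ? a : b;
}

/*
 * hole_sectors: size of the gap found in the btree (the "sectors" argument)
 * bio_sectors:  what the reader actually asked for
 * readahead:    dc->readahead >> 9, i.e. configured readahead in sectors
 * sectors_left: from the end of the bio to the end of the backing device
 */
static unsigned cache_fill_sectors(unsigned hole_sectors, unsigned bio_sectors,
				   unsigned readahead,
				   unsigned long long sectors_left)
{
	unsigned long long reada = min_ull(readahead, sectors_left);

	/* insert_bio_sectors = min(sectors, bio_sectors(bio) + reada) */
	return (unsigned)min_ull(hole_sectors, bio_sectors + reada);
}

int main(void)
{
	/* 8-sector read, 2048-sector hole, 2048 sectors of readahead,
	 * but only 100 sectors left before the end of the device */
	printf("cache fill: %u sectors\n",
	       cache_fill_sectors(2048, 8, 2048, 100));
	return 0;
}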
937 1011
938static void request_read(struct cached_dev *dc, struct search *s) 1012static void cached_dev_read(struct cached_dev *dc, struct search *s)
939{ 1013{
940 struct closure *cl = &s->cl; 1014 struct closure *cl = &s->cl;
941 1015
942 check_should_skip(dc, s); 1016 closure_call(&s->iop.cl, cache_lookup, NULL, cl);
943 closure_call(&s->op.cl, btree_read_async, NULL, cl); 1017 continue_at(cl, cached_dev_read_done_bh, NULL);
944
945 continue_at(cl, request_read_done_bh, NULL);
946} 1018}
947 1019
948/* Process writes */ 1020/* Process writes */
@@ -956,47 +1028,52 @@ static void cached_dev_write_complete(struct closure *cl)
956 cached_dev_bio_complete(cl); 1028 cached_dev_bio_complete(cl);
957} 1029}
958 1030
959static void request_write(struct cached_dev *dc, struct search *s) 1031static void cached_dev_write(struct cached_dev *dc, struct search *s)
960{ 1032{
961 struct closure *cl = &s->cl; 1033 struct closure *cl = &s->cl;
962 struct bio *bio = &s->bio.bio; 1034 struct bio *bio = &s->bio.bio;
963 struct bkey start, end; 1035 struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0);
964 start = KEY(dc->disk.id, bio->bi_sector, 0); 1036 struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0);
965 end = KEY(dc->disk.id, bio_end_sector(bio), 0);
966 1037
967 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); 1038 bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end);
968 1039
969 check_should_skip(dc, s);
970 down_read_non_owner(&dc->writeback_lock); 1040 down_read_non_owner(&dc->writeback_lock);
971
972 if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { 1041 if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
973 s->op.skip = false; 1042 /*
974 s->writeback = true; 1043 * We overlap with some dirty data undergoing background
1044 * writeback, force this write to writeback
1045 */
1046 s->iop.bypass = false;
1047 s->iop.writeback = true;
975 } 1048 }
976 1049
1050 /*
1051 * Discards aren't _required_ to do anything, so skipping if
1052 * check_overlapping returned true is ok
1053 *
1054 * But check_overlapping drops dirty keys for which io hasn't started,
1055 * so we still want to call it.
1056 */
977 if (bio->bi_rw & REQ_DISCARD) 1057 if (bio->bi_rw & REQ_DISCARD)
978 goto skip; 1058 s->iop.bypass = true;
979 1059
980 if (should_writeback(dc, s->orig_bio, 1060 if (should_writeback(dc, s->orig_bio,
981 cache_mode(dc, bio), 1061 cache_mode(dc, bio),
982 s->op.skip)) { 1062 s->iop.bypass)) {
983 s->op.skip = false; 1063 s->iop.bypass = false;
984 s->writeback = true; 1064 s->iop.writeback = true;
985 } 1065 }
986 1066
987 if (s->op.skip) 1067 if (s->iop.bypass) {
988 goto skip; 1068 s->iop.bio = s->orig_bio;
989 1069 bio_get(s->iop.bio);
990 trace_bcache_write(s->orig_bio, s->writeback, s->op.skip);
991 1070
992 if (!s->writeback) { 1071 if (!(bio->bi_rw & REQ_DISCARD) ||
993 s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, 1072 blk_queue_discard(bdev_get_queue(dc->bdev)))
994 dc->disk.bio_split); 1073 closure_bio_submit(bio, cl, s->d);
995 1074 } else if (s->iop.writeback) {
996 closure_bio_submit(bio, cl, s->d);
997 } else {
998 bch_writeback_add(dc); 1075 bch_writeback_add(dc);
999 s->op.cache_bio = bio; 1076 s->iop.bio = bio;
1000 1077
1001 if (bio->bi_rw & REQ_FLUSH) { 1078 if (bio->bi_rw & REQ_FLUSH) {
1002 /* Also need to send a flush to the backing device */ 1079 /* Also need to send a flush to the backing device */
@@ -1010,36 +1087,26 @@ static void request_write(struct cached_dev *dc, struct search *s)
1010 1087
1011 closure_bio_submit(flush, cl, s->d); 1088 closure_bio_submit(flush, cl, s->d);
1012 } 1089 }
1013 } 1090 } else {
1014out: 1091 s->iop.bio = bio_clone_bioset(bio, GFP_NOIO,
1015 closure_call(&s->op.cl, bch_insert_data, NULL, cl); 1092 dc->disk.bio_split);
1016 continue_at(cl, cached_dev_write_complete, NULL);
1017skip:
1018 s->op.skip = true;
1019 s->op.cache_bio = s->orig_bio;
1020 bio_get(s->op.cache_bio);
1021 1093
1022 if ((bio->bi_rw & REQ_DISCARD) && 1094 closure_bio_submit(bio, cl, s->d);
1023 !blk_queue_discard(bdev_get_queue(dc->bdev))) 1095 }
1024 goto out;
1025 1096
1026 closure_bio_submit(bio, cl, s->d); 1097 closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
1027 goto out; 1098 continue_at(cl, cached_dev_write_complete, NULL);
1028} 1099}
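cached_dev_write() above now keeps the whole write-path decision in one place: an overlap with dirty data undergoing background writeback forces writeback, a discard forces bypass (and only reaches the backing device if it supports discards), should_writeback() can upgrade the write to writeback, and whatever is left becomes a writethrough clone. The flag juggling compressed into a pure function; should_writeback() is folded into a precomputed wants_writeback input here, which is a simplification (in bcache it also looks at the current bypass value):

#include <stdbool.h>
#include <stdio.h>

enum write_policy { WRITE_BYPASS, WRITE_WRITEBACK, WRITE_WRITETHROUGH };

static enum write_policy classify_write(bool bypass, bool overlaps_dirty,
					bool is_discard, bool wants_writeback)
{
	bool writeback = false;

	if (overlaps_dirty) {		/* racing with background writeback */
		bypass = false;
		writeback = true;
	}

	if (is_discard)			/* discards may legally be dropped */
		bypass = true;

	if (wants_writeback) {		/* e.g. REQ_SYNC write in writeback mode */
		bypass = false;
		writeback = true;
	}

	if (bypass)
		return WRITE_BYPASS;
	return writeback ? WRITE_WRITEBACK : WRITE_WRITETHROUGH;
}

int main(void)
{
	static const char *names[] = { "bypass", "writeback", "writethrough" };

	printf("plain write         -> %s\n",
	       names[classify_write(false, false, false, false)]);
	printf("sync writeback-mode -> %s\n",
	       names[classify_write(false, false, false, true)]);
	printf("overlaps dirty data -> %s\n",
	       names[classify_write(true, true, false, false)]);
	return 0;
}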
1029 1100
1030static void request_nodata(struct cached_dev *dc, struct search *s) 1101static void cached_dev_nodata(struct closure *cl)
1031{ 1102{
1032 struct closure *cl = &s->cl; 1103 struct search *s = container_of(cl, struct search, cl);
1033 struct bio *bio = &s->bio.bio; 1104 struct bio *bio = &s->bio.bio;
1034 1105
1035 if (bio->bi_rw & REQ_DISCARD) { 1106 if (s->iop.flush_journal)
1036 request_write(dc, s); 1107 bch_journal_meta(s->iop.c, cl);
1037 return;
1038 }
1039
1040 if (s->op.flush_journal)
1041 bch_journal_meta(s->op.c, cl);
1042 1108
1109 /* If it's a flush, we send the flush to the backing device too */
1043 closure_bio_submit(bio, cl, s->d); 1110 closure_bio_submit(bio, cl, s->d);
1044 1111
1045 continue_at(cl, cached_dev_bio_complete, NULL); 1112 continue_at(cl, cached_dev_bio_complete, NULL);
@@ -1047,134 +1114,6 @@ static void request_nodata(struct cached_dev *dc, struct search *s)
1047 1114
1048/* Cached devices - read & write stuff */ 1115/* Cached devices - read & write stuff */
1049 1116
1050unsigned bch_get_congested(struct cache_set *c)
1051{
1052 int i;
1053 long rand;
1054
1055 if (!c->congested_read_threshold_us &&
1056 !c->congested_write_threshold_us)
1057 return 0;
1058
1059 i = (local_clock_us() - c->congested_last_us) / 1024;
1060 if (i < 0)
1061 return 0;
1062
1063 i += atomic_read(&c->congested);
1064 if (i >= 0)
1065 return 0;
1066
1067 i += CONGESTED_MAX;
1068
1069 if (i > 0)
1070 i = fract_exp_two(i, 6);
1071
1072 rand = get_random_int();
1073 i -= bitmap_weight(&rand, BITS_PER_LONG);
1074
1075 return i > 0 ? i : 1;
1076}
1077
1078static void add_sequential(struct task_struct *t)
1079{
1080 ewma_add(t->sequential_io_avg,
1081 t->sequential_io, 8, 0);
1082
1083 t->sequential_io = 0;
1084}
1085
1086static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
1087{
1088 return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
1089}
1090
1091static void check_should_skip(struct cached_dev *dc, struct search *s)
1092{
1093 struct cache_set *c = s->op.c;
1094 struct bio *bio = &s->bio.bio;
1095 unsigned mode = cache_mode(dc, bio);
1096 unsigned sectors, congested = bch_get_congested(c);
1097
1098 if (atomic_read(&dc->disk.detaching) ||
1099 c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
1100 (bio->bi_rw & REQ_DISCARD))
1101 goto skip;
1102
1103 if (mode == CACHE_MODE_NONE ||
1104 (mode == CACHE_MODE_WRITEAROUND &&
1105 (bio->bi_rw & REQ_WRITE)))
1106 goto skip;
1107
1108 if (bio->bi_sector & (c->sb.block_size - 1) ||
1109 bio_sectors(bio) & (c->sb.block_size - 1)) {
1110 pr_debug("skipping unaligned io");
1111 goto skip;
1112 }
1113
1114 if (!congested && !dc->sequential_cutoff)
1115 goto rescale;
1116
1117 if (!congested &&
1118 mode == CACHE_MODE_WRITEBACK &&
1119 (bio->bi_rw & REQ_WRITE) &&
1120 (bio->bi_rw & REQ_SYNC))
1121 goto rescale;
1122
1123 if (dc->sequential_merge) {
1124 struct io *i;
1125
1126 spin_lock(&dc->io_lock);
1127
1128 hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash)
1129 if (i->last == bio->bi_sector &&
1130 time_before(jiffies, i->jiffies))
1131 goto found;
1132
1133 i = list_first_entry(&dc->io_lru, struct io, lru);
1134
1135 add_sequential(s->task);
1136 i->sequential = 0;
1137found:
1138 if (i->sequential + bio->bi_size > i->sequential)
1139 i->sequential += bio->bi_size;
1140
1141 i->last = bio_end_sector(bio);
1142 i->jiffies = jiffies + msecs_to_jiffies(5000);
1143 s->task->sequential_io = i->sequential;
1144
1145 hlist_del(&i->hash);
1146 hlist_add_head(&i->hash, iohash(dc, i->last));
1147 list_move_tail(&i->lru, &dc->io_lru);
1148
1149 spin_unlock(&dc->io_lock);
1150 } else {
1151 s->task->sequential_io = bio->bi_size;
1152
1153 add_sequential(s->task);
1154 }
1155
1156 sectors = max(s->task->sequential_io,
1157 s->task->sequential_io_avg) >> 9;
1158
1159 if (dc->sequential_cutoff &&
1160 sectors >= dc->sequential_cutoff >> 9) {
1161 trace_bcache_bypass_sequential(s->orig_bio);
1162 goto skip;
1163 }
1164
1165 if (congested && sectors >= congested) {
1166 trace_bcache_bypass_congested(s->orig_bio);
1167 goto skip;
1168 }
1169
1170rescale:
1171 bch_rescale_priorities(c, bio_sectors(bio));
1172 return;
1173skip:
1174 bch_mark_sectors_bypassed(s, bio_sectors(bio));
1175 s->op.skip = true;
1176}
1177
1178static void cached_dev_make_request(struct request_queue *q, struct bio *bio) 1117static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
1179{ 1118{
1180 struct search *s; 1119 struct search *s;
@@ -1192,14 +1131,24 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
1192 1131
1193 if (cached_dev_get(dc)) { 1132 if (cached_dev_get(dc)) {
1194 s = search_alloc(bio, d); 1133 s = search_alloc(bio, d);
1195 trace_bcache_request_start(s, bio); 1134 trace_bcache_request_start(s->d, bio);
1196 1135
1197 if (!bio_has_data(bio)) 1136 if (!bio->bi_size) {
1198 request_nodata(dc, s); 1137 /*
1199 else if (rw) 1138 * can't call bch_journal_meta from under
1200 request_write(dc, s); 1139 * generic_make_request
1201 else 1140 */
1202 request_read(dc, s); 1141 continue_at_nobarrier(&s->cl,
1142 cached_dev_nodata,
1143 bcache_wq);
1144 } else {
1145 s->iop.bypass = check_should_bypass(dc, bio);
1146
1147 if (rw)
1148 cached_dev_write(dc, s);
1149 else
1150 cached_dev_read(dc, s);
1151 }
1203 } else { 1152 } else {
1204 if ((bio->bi_rw & REQ_DISCARD) && 1153 if ((bio->bi_rw & REQ_DISCARD) &&
1205 !blk_queue_discard(bdev_get_queue(dc->bdev))) 1154 !blk_queue_discard(bdev_get_queue(dc->bdev)))
@@ -1274,9 +1223,19 @@ static int flash_dev_cache_miss(struct btree *b, struct search *s,
1274 bio_advance(bio, min(sectors << 9, bio->bi_size)); 1223 bio_advance(bio, min(sectors << 9, bio->bi_size));
1275 1224
1276 if (!bio->bi_size) 1225 if (!bio->bi_size)
1277 s->op.lookup_done = true; 1226 return MAP_DONE;
1278 1227
1279 return 0; 1228 return MAP_CONTINUE;
1229}
1230
1231static void flash_dev_nodata(struct closure *cl)
1232{
1233 struct search *s = container_of(cl, struct search, cl);
1234
1235 if (s->iop.flush_journal)
1236 bch_journal_meta(s->iop.c, cl);
1237
1238 continue_at(cl, search_free, NULL);
1280} 1239}
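flash_dev_nodata() above exists for the same reason as cached_dev_nodata(): bch_journal_meta() may block, so an empty (flush-only) bio can't be handled directly from inside generic_make_request() and is bounced to bcache_wq with continue_at_nobarrier(). Rough shape of that "do it from another context" deferral, with a POSIX thread standing in for the workqueue (illustration only, not kernel code):

#include <pthread.h>
#include <stdio.h>

static void journal_meta(void)
{
	/* may sleep - must not run in the bio submission context */
	printf("flush-only request: journalling metadata from the worker\n");
}

static void *worker(void *arg)
{
	(void)arg;
	journal_meta();
	return NULL;
}

static void make_request_nodata(void)
{
	pthread_t t;

	/* hand the work off instead of calling journal_meta() inline */
	pthread_create(&t, NULL, worker, NULL);
	pthread_join(&t, NULL);	/* join only so the demo doesn't exit early */
}

int main(void)
{
	make_request_nodata();
	return 0;
}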
1281 1240
1282static void flash_dev_make_request(struct request_queue *q, struct bio *bio) 1241static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
@@ -1295,23 +1254,28 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
1295 cl = &s->cl; 1254 cl = &s->cl;
1296 bio = &s->bio.bio; 1255 bio = &s->bio.bio;
1297 1256
1298 trace_bcache_request_start(s, bio); 1257 trace_bcache_request_start(s->d, bio);
1299 1258
1300 if (bio_has_data(bio) && !rw) { 1259 if (!bio->bi_size) {
1301 closure_call(&s->op.cl, btree_read_async, NULL, cl); 1260 /*
1302 } else if (bio_has_data(bio) || s->op.skip) { 1261 * can't call bch_journal_meta from under
1303 bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, 1262 * generic_make_request
1263 */
1264 continue_at_nobarrier(&s->cl,
1265 flash_dev_nodata,
1266 bcache_wq);
1267 } else if (rw) {
1268 bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
1304 &KEY(d->id, bio->bi_sector, 0), 1269 &KEY(d->id, bio->bi_sector, 0),
1305 &KEY(d->id, bio_end_sector(bio), 0)); 1270 &KEY(d->id, bio_end_sector(bio), 0));
1306 1271
1307 s->writeback = true; 1272 s->iop.bypass = (bio->bi_rw & REQ_DISCARD) != 0;
1308 s->op.cache_bio = bio; 1273 s->iop.writeback = true;
1274 s->iop.bio = bio;
1309 1275
1310 closure_call(&s->op.cl, bch_insert_data, NULL, cl); 1276 closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
1311 } else { 1277 } else {
1312 /* No data - probably a cache flush */ 1278 closure_call(&s->iop.cl, cache_lookup, NULL, cl);
1313 if (s->op.flush_journal)
1314 bch_journal_meta(s->op.c, cl);
1315 } 1279 }
1316 1280
1317 continue_at(cl, search_free, NULL); 1281 continue_at(cl, search_free, NULL);
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 57dc4784f4f4..2cd65bf073c2 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -3,40 +3,33 @@
3 3
4#include <linux/cgroup.h> 4#include <linux/cgroup.h>
5 5
6struct search { 6struct data_insert_op {
7 /* Stack frame for bio_complete */
8 struct closure cl; 7 struct closure cl;
8 struct cache_set *c;
9 struct bio *bio;
9 10
10 struct bcache_device *d; 11 unsigned inode;
11 struct task_struct *task; 12 uint16_t write_point;
12 13 uint16_t write_prio;
13 struct bbio bio; 14 short error;
14 struct bio *orig_bio;
15 struct bio *cache_miss;
16 unsigned cache_bio_sectors;
17
18 unsigned recoverable:1;
19 unsigned unaligned_bvec:1;
20 15
21 unsigned write:1; 16 unsigned bypass:1;
22 unsigned writeback:1; 17 unsigned writeback:1;
18 unsigned flush_journal:1;
19 unsigned csum:1;
23 20
24 /* IO error returned to s->bio */ 21 unsigned replace:1;
25 short error; 22 unsigned replace_collision:1;
26 unsigned long start_time; 23
24 unsigned insert_data_done:1;
27 25
28 /* Anything past op->keys won't get zeroed in do_bio_hook */ 26 /* Anything past this point won't get zeroed in search_alloc() */
29 struct btree_op op; 27 struct keylist insert_keys;
28 BKEY_PADDED(replace_key);
30}; 29};
31 30
32void bch_cache_read_endio(struct bio *, int);
33unsigned bch_get_congested(struct cache_set *); 31unsigned bch_get_congested(struct cache_set *);
34void bch_insert_data(struct closure *cl); 32void bch_data_insert(struct closure *cl);
35void bch_btree_insert_async(struct closure *);
36void bch_cache_read_endio(struct bio *, int);
37
38void bch_open_buckets_free(struct cache_set *);
39int bch_open_buckets_alloc(struct cache_set *);
40 33
41void bch_cached_dev_request_init(struct cached_dev *dc); 34void bch_cached_dev_request_init(struct cached_dev *dc);
42void bch_flash_dev_request_init(struct bcache_device *d); 35void bch_flash_dev_request_init(struct bcache_device *d);
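The new struct data_insert_op in request.h keeps the same "anything past this point won't get zeroed" convention the old struct search had: search_alloc() memsets only up to offsetof(struct search, iop.insert_keys), so the trailing keylist and padded replace key are left alone on reuse. A self-contained illustration of that partial-reset trick (the struct here is a stand-in, not the real layout):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct op {
	int error;
	unsigned bypass:1;
	unsigned writeback:1;

	/* anything past this point is not zeroed on reuse */
	char insert_keys[256];	/* stand-in for struct keylist + padded bkey */
};

static void op_reset(struct op *op)
{
	memset(op, 0, offsetof(struct op, insert_keys));
}

int main(void)
{
	struct op op;

	memset(&op, 0xff, sizeof(op));	/* pretend the allocation is dirty */
	op_reset(&op);

	printf("error=%d, first keylist byte=0x%02x\n",
	       op.error, (unsigned char)op.insert_keys[0]);
	return 0;
}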
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index b8730e714d69..84d0782f702e 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -7,7 +7,6 @@
7#include "bcache.h" 7#include "bcache.h"
8#include "stats.h" 8#include "stats.h"
9#include "btree.h" 9#include "btree.h"
10#include "request.h"
11#include "sysfs.h" 10#include "sysfs.h"
12 11
13/* 12/*
@@ -196,35 +195,36 @@ static void mark_cache_stats(struct cache_stat_collector *stats,
196 atomic_inc(&stats->cache_bypass_misses); 195 atomic_inc(&stats->cache_bypass_misses);
197} 196}
198 197
199void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass) 198void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d,
199 bool hit, bool bypass)
200{ 200{
201 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 201 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
202 mark_cache_stats(&dc->accounting.collector, hit, bypass); 202 mark_cache_stats(&dc->accounting.collector, hit, bypass);
203 mark_cache_stats(&s->op.c->accounting.collector, hit, bypass); 203 mark_cache_stats(&c->accounting.collector, hit, bypass);
204#ifdef CONFIG_CGROUP_BCACHE 204#ifdef CONFIG_CGROUP_BCACHE
205 mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass); 205 mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass);
206#endif 206#endif
207} 207}
208 208
209void bch_mark_cache_readahead(struct search *s) 209void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d)
210{ 210{
211 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 211 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
212 atomic_inc(&dc->accounting.collector.cache_readaheads); 212 atomic_inc(&dc->accounting.collector.cache_readaheads);
213 atomic_inc(&s->op.c->accounting.collector.cache_readaheads); 213 atomic_inc(&c->accounting.collector.cache_readaheads);
214} 214}
215 215
216void bch_mark_cache_miss_collision(struct search *s) 216void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d)
217{ 217{
218 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 218 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
219 atomic_inc(&dc->accounting.collector.cache_miss_collisions); 219 atomic_inc(&dc->accounting.collector.cache_miss_collisions);
220 atomic_inc(&s->op.c->accounting.collector.cache_miss_collisions); 220 atomic_inc(&c->accounting.collector.cache_miss_collisions);
221} 221}
222 222
223void bch_mark_sectors_bypassed(struct search *s, int sectors) 223void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc,
224 int sectors)
224{ 225{
225 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
226 atomic_add(sectors, &dc->accounting.collector.sectors_bypassed); 226 atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
227 atomic_add(sectors, &s->op.c->accounting.collector.sectors_bypassed); 227 atomic_add(sectors, &c->accounting.collector.sectors_bypassed);
228} 228}
229 229
230void bch_cache_accounting_init(struct cache_accounting *acc, 230void bch_cache_accounting_init(struct cache_accounting *acc,
diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h
index c7c7a8fd29fe..adbff141c887 100644
--- a/drivers/md/bcache/stats.h
+++ b/drivers/md/bcache/stats.h
@@ -38,7 +38,9 @@ struct cache_accounting {
38 struct cache_stats day; 38 struct cache_stats day;
39}; 39};
40 40
41struct search; 41struct cache_set;
42struct cached_dev;
43struct bcache_device;
42 44
43void bch_cache_accounting_init(struct cache_accounting *acc, 45void bch_cache_accounting_init(struct cache_accounting *acc,
44 struct closure *parent); 46 struct closure *parent);
@@ -50,9 +52,10 @@ void bch_cache_accounting_clear(struct cache_accounting *acc);
50 52
51void bch_cache_accounting_destroy(struct cache_accounting *acc); 53void bch_cache_accounting_destroy(struct cache_accounting *acc);
52 54
53void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass); 55void bch_mark_cache_accounting(struct cache_set *, struct bcache_device *,
54void bch_mark_cache_readahead(struct search *s); 56 bool, bool);
55void bch_mark_cache_miss_collision(struct search *s); 57void bch_mark_cache_readahead(struct cache_set *, struct bcache_device *);
56void bch_mark_sectors_bypassed(struct search *s, int sectors); 58void bch_mark_cache_miss_collision(struct cache_set *, struct bcache_device *);
59void bch_mark_sectors_bypassed(struct cache_set *, struct cached_dev *, int);
57 60
58#endif /* _BCACHE_STATS_H_ */ 61#endif /* _BCACHE_STATS_H_ */
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 547c4c57b052..dec15cd2d797 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -16,6 +16,7 @@
16#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
17#include <linux/debugfs.h> 17#include <linux/debugfs.h>
18#include <linux/genhd.h> 18#include <linux/genhd.h>
19#include <linux/idr.h>
19#include <linux/kthread.h> 20#include <linux/kthread.h>
20#include <linux/module.h> 21#include <linux/module.h>
21#include <linux/random.h> 22#include <linux/random.h>
@@ -45,21 +46,13 @@ const char * const bch_cache_modes[] = {
45 NULL 46 NULL
46}; 47};
47 48
48struct uuid_entry_v0 {
49 uint8_t uuid[16];
50 uint8_t label[32];
51 uint32_t first_reg;
52 uint32_t last_reg;
53 uint32_t invalidated;
54 uint32_t pad;
55};
56
57static struct kobject *bcache_kobj; 49static struct kobject *bcache_kobj;
58struct mutex bch_register_lock; 50struct mutex bch_register_lock;
59LIST_HEAD(bch_cache_sets); 51LIST_HEAD(bch_cache_sets);
60static LIST_HEAD(uncached_devices); 52static LIST_HEAD(uncached_devices);
61 53
62static int bcache_major, bcache_minor; 54static int bcache_major;
55static DEFINE_IDA(bcache_minor);
63static wait_queue_head_t unregister_wait; 56static wait_queue_head_t unregister_wait;
64struct workqueue_struct *bcache_wq; 57struct workqueue_struct *bcache_wq;
65 58
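In super.c the bare, ever-increasing bcache_minor counter becomes an IDA, so minor numbers can be handed back and reused when a bcache device is unregistered instead of leaking forever. A toy fixed-size version of that "lowest free id" allocator (the real IDA grows dynamically; this bitmap caps out at 64 ids):

#include <stdio.h>

#define MAX_IDS 64

static unsigned long long used;		/* bit i set => minor i in use */

static int id_alloc(void)
{
	for (int i = 0; i < MAX_IDS; i++)
		if (!(used & (1ULL << i))) {
			used |= 1ULL << i;
			return i;
		}
	return -1;			/* the kernel would return -ENOSPC */
}

static void id_free(int id)
{
	used &= ~(1ULL << id);
}

int main(void)
{
	int a = id_alloc(), b = id_alloc();

	printf("first two minors: %d %d\n", a, b);
	id_free(a);
	printf("after freeing %d the next allocation is: %d\n", a, id_alloc());
	return 0;
}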
@@ -382,7 +375,7 @@ static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
382{ 375{
383 struct bkey *k = &j->uuid_bucket; 376 struct bkey *k = &j->uuid_bucket;
384 377
385 if (__bch_ptr_invalid(c, 1, k)) 378 if (bch_btree_ptr_invalid(c, k))
386 return "bad uuid pointer"; 379 return "bad uuid pointer";
387 380
388 bkey_copy(&c->uuid_bucket, k); 381 bkey_copy(&c->uuid_bucket, k);
@@ -427,7 +420,7 @@ static int __uuid_write(struct cache_set *c)
427 420
428 lockdep_assert_held(&bch_register_lock); 421 lockdep_assert_held(&bch_register_lock);
429 422
430 if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, &cl)) 423 if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true))
431 return 1; 424 return 1;
432 425
433 SET_KEY_SIZE(&k.key, c->sb.bucket_size); 426 SET_KEY_SIZE(&k.key, c->sb.bucket_size);
@@ -435,7 +428,7 @@ static int __uuid_write(struct cache_set *c)
435 closure_sync(&cl); 428 closure_sync(&cl);
436 429
437 bkey_copy(&c->uuid_bucket, &k.key); 430 bkey_copy(&c->uuid_bucket, &k.key);
438 __bkey_put(c, &k.key); 431 bkey_put(c, &k.key);
439 return 0; 432 return 0;
440} 433}
441 434
@@ -562,10 +555,10 @@ void bch_prio_write(struct cache *ca)
562 } 555 }
563 556
564 p->next_bucket = ca->prio_buckets[i + 1]; 557 p->next_bucket = ca->prio_buckets[i + 1];
565 p->magic = pset_magic(ca); 558 p->magic = pset_magic(&ca->sb);
566 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); 559 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
567 560
568 bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, &cl); 561 bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true);
569 BUG_ON(bucket == -1); 562 BUG_ON(bucket == -1);
570 563
571 mutex_unlock(&ca->set->bucket_lock); 564 mutex_unlock(&ca->set->bucket_lock);
@@ -613,7 +606,7 @@ static void prio_read(struct cache *ca, uint64_t bucket)
613 if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8)) 606 if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8))
614 pr_warn("bad csum reading priorities"); 607 pr_warn("bad csum reading priorities");
615 608
616 if (p->magic != pset_magic(ca)) 609 if (p->magic != pset_magic(&ca->sb))
617 pr_warn("bad magic reading priorities"); 610 pr_warn("bad magic reading priorities");
618 611
619 bucket = p->next_bucket; 612 bucket = p->next_bucket;
@@ -630,7 +623,7 @@ static void prio_read(struct cache *ca, uint64_t bucket)
630static int open_dev(struct block_device *b, fmode_t mode) 623static int open_dev(struct block_device *b, fmode_t mode)
631{ 624{
632 struct bcache_device *d = b->bd_disk->private_data; 625 struct bcache_device *d = b->bd_disk->private_data;
633 if (atomic_read(&d->closing)) 626 if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
634 return -ENXIO; 627 return -ENXIO;
635 628
636 closure_get(&d->cl); 629 closure_get(&d->cl);
@@ -659,20 +652,24 @@ static const struct block_device_operations bcache_ops = {
659 652
660void bcache_device_stop(struct bcache_device *d) 653void bcache_device_stop(struct bcache_device *d)
661{ 654{
662 if (!atomic_xchg(&d->closing, 1)) 655 if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
663 closure_queue(&d->cl); 656 closure_queue(&d->cl);
664} 657}
665 658
666static void bcache_device_unlink(struct bcache_device *d) 659static void bcache_device_unlink(struct bcache_device *d)
667{ 660{
668 unsigned i; 661 lockdep_assert_held(&bch_register_lock);
669 struct cache *ca;
670 662
671 sysfs_remove_link(&d->c->kobj, d->name); 663 if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
672 sysfs_remove_link(&d->kobj, "cache"); 664 unsigned i;
665 struct cache *ca;
673 666
674 for_each_cache(ca, d->c, i) 667 sysfs_remove_link(&d->c->kobj, d->name);
675 bd_unlink_disk_holder(ca->bdev, d->disk); 668 sysfs_remove_link(&d->kobj, "cache");
669
670 for_each_cache(ca, d->c, i)
671 bd_unlink_disk_holder(ca->bdev, d->disk);
672 }
676} 673}
677 674
678static void bcache_device_link(struct bcache_device *d, struct cache_set *c, 675static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
@@ -696,19 +693,16 @@ static void bcache_device_detach(struct bcache_device *d)
696{ 693{
697 lockdep_assert_held(&bch_register_lock); 694 lockdep_assert_held(&bch_register_lock);
698 695
699 if (atomic_read(&d->detaching)) { 696 if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
700 struct uuid_entry *u = d->c->uuids + d->id; 697 struct uuid_entry *u = d->c->uuids + d->id;
701 698
702 SET_UUID_FLASH_ONLY(u, 0); 699 SET_UUID_FLASH_ONLY(u, 0);
703 memcpy(u->uuid, invalid_uuid, 16); 700 memcpy(u->uuid, invalid_uuid, 16);
704 u->invalidated = cpu_to_le32(get_seconds()); 701 u->invalidated = cpu_to_le32(get_seconds());
705 bch_uuid_write(d->c); 702 bch_uuid_write(d->c);
706
707 atomic_set(&d->detaching, 0);
708 } 703 }
709 704
710 if (!d->flush_done) 705 bcache_device_unlink(d);
711 bcache_device_unlink(d);
712 706
713 d->c->devices[d->id] = NULL; 707 d->c->devices[d->id] = NULL;
714 closure_put(&d->c->caching); 708 closure_put(&d->c->caching);
@@ -739,14 +733,20 @@ static void bcache_device_free(struct bcache_device *d)
739 del_gendisk(d->disk); 733 del_gendisk(d->disk);
740 if (d->disk && d->disk->queue) 734 if (d->disk && d->disk->queue)
741 blk_cleanup_queue(d->disk->queue); 735 blk_cleanup_queue(d->disk->queue);
742 if (d->disk) 736 if (d->disk) {
737 ida_simple_remove(&bcache_minor, d->disk->first_minor);
743 put_disk(d->disk); 738 put_disk(d->disk);
739 }
744 740
745 bio_split_pool_free(&d->bio_split_hook); 741 bio_split_pool_free(&d->bio_split_hook);
746 if (d->unaligned_bvec) 742 if (d->unaligned_bvec)
747 mempool_destroy(d->unaligned_bvec); 743 mempool_destroy(d->unaligned_bvec);
748 if (d->bio_split) 744 if (d->bio_split)
749 bioset_free(d->bio_split); 745 bioset_free(d->bio_split);
746 if (is_vmalloc_addr(d->full_dirty_stripes))
747 vfree(d->full_dirty_stripes);
748 else
749 kfree(d->full_dirty_stripes);
750 if (is_vmalloc_addr(d->stripe_sectors_dirty)) 750 if (is_vmalloc_addr(d->stripe_sectors_dirty))
751 vfree(d->stripe_sectors_dirty); 751 vfree(d->stripe_sectors_dirty);
752 else 752 else
@@ -760,15 +760,19 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
760{ 760{
761 struct request_queue *q; 761 struct request_queue *q;
762 size_t n; 762 size_t n;
763 int minor;
763 764
764 if (!d->stripe_size_bits) 765 if (!d->stripe_size)
765 d->stripe_size_bits = 31; 766 d->stripe_size = 1 << 31;
766 767
767 d->nr_stripes = round_up(sectors, 1 << d->stripe_size_bits) >> 768 d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
768 d->stripe_size_bits;
769 769
770 if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) 770 if (!d->nr_stripes ||
771 d->nr_stripes > INT_MAX ||
772 d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) {
773 pr_err("nr_stripes too large");
771 return -ENOMEM; 774 return -ENOMEM;
775 }
772 776
773 n = d->nr_stripes * sizeof(atomic_t); 777 n = d->nr_stripes * sizeof(atomic_t);
774 d->stripe_sectors_dirty = n < PAGE_SIZE << 6 778 d->stripe_sectors_dirty = n < PAGE_SIZE << 6
@@ -777,22 +781,38 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
777 if (!d->stripe_sectors_dirty) 781 if (!d->stripe_sectors_dirty)
778 return -ENOMEM; 782 return -ENOMEM;
779 783
784 n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
785 d->full_dirty_stripes = n < PAGE_SIZE << 6
786 ? kzalloc(n, GFP_KERNEL)
787 : vzalloc(n);
788 if (!d->full_dirty_stripes)
789 return -ENOMEM;
790
791 minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL);
792 if (minor < 0)
793 return minor;
794
780 if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || 795 if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
781 !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, 796 !(d->unaligned_bvec = mempool_create_kmalloc_pool(1,
782 sizeof(struct bio_vec) * BIO_MAX_PAGES)) || 797 sizeof(struct bio_vec) * BIO_MAX_PAGES)) ||
783 bio_split_pool_init(&d->bio_split_hook) || 798 bio_split_pool_init(&d->bio_split_hook) ||
784 !(d->disk = alloc_disk(1)) || 799 !(d->disk = alloc_disk(1))) {
785 !(q = blk_alloc_queue(GFP_KERNEL))) 800 ida_simple_remove(&bcache_minor, minor);
786 return -ENOMEM; 801 return -ENOMEM;
802 }
787 803
788 set_capacity(d->disk, sectors); 804 set_capacity(d->disk, sectors);
789 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); 805 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor);
790 806
791 d->disk->major = bcache_major; 807 d->disk->major = bcache_major;
792 d->disk->first_minor = bcache_minor++; 808 d->disk->first_minor = minor;
793 d->disk->fops = &bcache_ops; 809 d->disk->fops = &bcache_ops;
794 d->disk->private_data = d; 810 d->disk->private_data = d;
795 811
812 q = blk_alloc_queue(GFP_KERNEL);
813 if (!q)
814 return -ENOMEM;
815
796 blk_queue_make_request(q, NULL); 816 blk_queue_make_request(q, NULL);
797 d->disk->queue = q; 817 d->disk->queue = q;
798 q->queuedata = d; 818 q->queuedata = d;
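The bcache_device_init()/bcache_device_free() hunks above replace the ever-growing bcache_minor counter with an IDA, so minor numbers are returned to the pool when a device goes away. A minimal sketch of that allocate/release pattern, using the same ida_simple_* calls as the patch (everything else here is illustrative):

#include <linux/idr.h>
#include <linux/genhd.h>
#include <linux/kdev_t.h>

static DEFINE_IDA(example_minor_ida);

static int example_assign_minor(struct gendisk *disk)
{
	/* Smallest free id in [0, MINORMASK]; may sleep under GFP_KERNEL. */
	int minor = ida_simple_get(&example_minor_ida, 0, MINORMASK + 1,
				   GFP_KERNEL);
	if (minor < 0)
		return minor;

	disk->first_minor = minor;
	return 0;
}

static void example_release_minor(struct gendisk *disk)
{
	/* Return the id so a later device can reuse it. */
	ida_simple_remove(&example_minor_ida, disk->first_minor);
}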
@@ -874,7 +894,7 @@ static void cached_dev_detach_finish(struct work_struct *w)
874 struct closure cl; 894 struct closure cl;
875 closure_init_stack(&cl); 895 closure_init_stack(&cl);
876 896
877 BUG_ON(!atomic_read(&dc->disk.detaching)); 897 BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
878 BUG_ON(atomic_read(&dc->count)); 898 BUG_ON(atomic_read(&dc->count));
879 899
880 mutex_lock(&bch_register_lock); 900 mutex_lock(&bch_register_lock);
@@ -888,6 +908,8 @@ static void cached_dev_detach_finish(struct work_struct *w)
888 bcache_device_detach(&dc->disk); 908 bcache_device_detach(&dc->disk);
889 list_move(&dc->list, &uncached_devices); 909 list_move(&dc->list, &uncached_devices);
890 910
911 clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
912
891 mutex_unlock(&bch_register_lock); 913 mutex_unlock(&bch_register_lock);
892 914
893 pr_info("Caching disabled for %s", bdevname(dc->bdev, buf)); 915 pr_info("Caching disabled for %s", bdevname(dc->bdev, buf));
@@ -900,10 +922,10 @@ void bch_cached_dev_detach(struct cached_dev *dc)
900{ 922{
901 lockdep_assert_held(&bch_register_lock); 923 lockdep_assert_held(&bch_register_lock);
902 924
903 if (atomic_read(&dc->disk.closing)) 925 if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
904 return; 926 return;
905 927
906 if (atomic_xchg(&dc->disk.detaching, 1)) 928 if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
907 return; 929 return;
908 930
909 /* 931 /*
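The hunks above fold the old closing/detaching atomic_t counters into single bits of a d->flags word, tested and set with the atomic bitops. A small sketch of the pattern; BCACHE_DEV_CLOSING and BCACHE_DEV_DETACHING come from bcache.h in this series, the rest is illustrative:

#include <linux/bitops.h>

struct example_dev {
	unsigned long flags;	/* holds the BCACHE_DEV_* bits */
};

static bool example_start_detach(struct example_dev *d)
{
	/* A device that is already being closed must not start a detach. */
	if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
		return false;

	/* test_and_set_bit() is atomic, so only one racing caller wins. */
	return !test_and_set_bit(BCACHE_DEV_DETACHING, &d->flags);
}

static void example_finish_detach(struct example_dev *d)
{
	clear_bit(BCACHE_DEV_DETACHING, &d->flags);
}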
@@ -1030,6 +1052,7 @@ static void cached_dev_free(struct closure *cl)
1030 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); 1052 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1031 1053
1032 cancel_delayed_work_sync(&dc->writeback_rate_update); 1054 cancel_delayed_work_sync(&dc->writeback_rate_update);
1055 kthread_stop(dc->writeback_thread);
1033 1056
1034 mutex_lock(&bch_register_lock); 1057 mutex_lock(&bch_register_lock);
1035 1058
@@ -1058,11 +1081,7 @@ static void cached_dev_flush(struct closure *cl)
1058 struct bcache_device *d = &dc->disk; 1081 struct bcache_device *d = &dc->disk;
1059 1082
1060 mutex_lock(&bch_register_lock); 1083 mutex_lock(&bch_register_lock);
1061 d->flush_done = 1; 1084 bcache_device_unlink(d);
1062
1063 if (d->c)
1064 bcache_device_unlink(d);
1065
1066 mutex_unlock(&bch_register_lock); 1085 mutex_unlock(&bch_register_lock);
1067 1086
1068 bch_cache_accounting_destroy(&dc->accounting); 1087 bch_cache_accounting_destroy(&dc->accounting);
@@ -1088,7 +1107,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
1088 spin_lock_init(&dc->io_lock); 1107 spin_lock_init(&dc->io_lock);
1089 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); 1108 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
1090 1109
1091 dc->sequential_merge = true;
1092 dc->sequential_cutoff = 4 << 20; 1110 dc->sequential_cutoff = 4 << 20;
1093 1111
1094 for (io = dc->io; io < dc->io + RECENT_IO; io++) { 1112 for (io = dc->io; io < dc->io + RECENT_IO; io++) {
@@ -1260,7 +1278,8 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
1260{ 1278{
1261 va_list args; 1279 va_list args;
1262 1280
1263 if (test_bit(CACHE_SET_STOPPING, &c->flags)) 1281 if (c->on_error != ON_ERROR_PANIC &&
1282 test_bit(CACHE_SET_STOPPING, &c->flags))
1264 return false; 1283 return false;
1265 1284
1266 /* XXX: we can be called from atomic context 1285 /* XXX: we can be called from atomic context
@@ -1275,6 +1294,9 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
1275 1294
1276 printk(", disabling caching\n"); 1295 printk(", disabling caching\n");
1277 1296
1297 if (c->on_error == ON_ERROR_PANIC)
1298 panic("panic forced after error\n");
1299
1278 bch_cache_set_unregister(c); 1300 bch_cache_set_unregister(c);
1279 return true; 1301 return true;
1280} 1302}
@@ -1339,6 +1361,9 @@ static void cache_set_flush(struct closure *cl)
1339 kobject_put(&c->internal); 1361 kobject_put(&c->internal);
1340 kobject_del(&c->kobj); 1362 kobject_del(&c->kobj);
1341 1363
1364 if (c->gc_thread)
1365 kthread_stop(c->gc_thread);
1366
1342 if (!IS_ERR_OR_NULL(c->root)) 1367 if (!IS_ERR_OR_NULL(c->root))
1343 list_add(&c->root->list, &c->btree_cache); 1368 list_add(&c->root->list, &c->btree_cache);
1344 1369
@@ -1433,12 +1458,19 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1433 1458
1434 c->sort_crit_factor = int_sqrt(c->btree_pages); 1459 c->sort_crit_factor = int_sqrt(c->btree_pages);
1435 1460
1436 mutex_init(&c->bucket_lock);
1437 mutex_init(&c->sort_lock);
1438 spin_lock_init(&c->sort_time_lock);
1439 closure_init_unlocked(&c->sb_write); 1461 closure_init_unlocked(&c->sb_write);
1462 mutex_init(&c->bucket_lock);
1463 init_waitqueue_head(&c->try_wait);
1464 init_waitqueue_head(&c->bucket_wait);
1440 closure_init_unlocked(&c->uuid_write); 1465 closure_init_unlocked(&c->uuid_write);
1441 spin_lock_init(&c->btree_read_time_lock); 1466 mutex_init(&c->sort_lock);
1467
1468 spin_lock_init(&c->sort_time.lock);
1469 spin_lock_init(&c->btree_gc_time.lock);
1470 spin_lock_init(&c->btree_split_time.lock);
1471 spin_lock_init(&c->btree_read_time.lock);
1472 spin_lock_init(&c->try_harder_time.lock);
1473
1442 bch_moving_init_cache_set(c); 1474 bch_moving_init_cache_set(c);
1443 1475
1444 INIT_LIST_HEAD(&c->list); 1476 INIT_LIST_HEAD(&c->list);
@@ -1483,11 +1515,10 @@ static void run_cache_set(struct cache_set *c)
1483 const char *err = "cannot allocate memory"; 1515 const char *err = "cannot allocate memory";
1484 struct cached_dev *dc, *t; 1516 struct cached_dev *dc, *t;
1485 struct cache *ca; 1517 struct cache *ca;
1518 struct closure cl;
1486 unsigned i; 1519 unsigned i;
1487 1520
1488 struct btree_op op; 1521 closure_init_stack(&cl);
1489 bch_btree_op_init_stack(&op);
1490 op.lock = SHRT_MAX;
1491 1522
1492 for_each_cache(ca, c, i) 1523 for_each_cache(ca, c, i)
1493 c->nbuckets += ca->sb.nbuckets; 1524 c->nbuckets += ca->sb.nbuckets;
@@ -1498,7 +1529,7 @@ static void run_cache_set(struct cache_set *c)
1498 struct jset *j; 1529 struct jset *j;
1499 1530
1500 err = "cannot allocate memory for journal"; 1531 err = "cannot allocate memory for journal";
1501 if (bch_journal_read(c, &journal, &op)) 1532 if (bch_journal_read(c, &journal))
1502 goto err; 1533 goto err;
1503 1534
1504 pr_debug("btree_journal_read() done"); 1535 pr_debug("btree_journal_read() done");
@@ -1522,23 +1553,23 @@ static void run_cache_set(struct cache_set *c)
1522 k = &j->btree_root; 1553 k = &j->btree_root;
1523 1554
1524 err = "bad btree root"; 1555 err = "bad btree root";
1525 if (__bch_ptr_invalid(c, j->btree_level + 1, k)) 1556 if (bch_btree_ptr_invalid(c, k))
1526 goto err; 1557 goto err;
1527 1558
1528 err = "error reading btree root"; 1559 err = "error reading btree root";
1529 c->root = bch_btree_node_get(c, k, j->btree_level, &op); 1560 c->root = bch_btree_node_get(c, k, j->btree_level, true);
1530 if (IS_ERR_OR_NULL(c->root)) 1561 if (IS_ERR_OR_NULL(c->root))
1531 goto err; 1562 goto err;
1532 1563
1533 list_del_init(&c->root->list); 1564 list_del_init(&c->root->list);
1534 rw_unlock(true, c->root); 1565 rw_unlock(true, c->root);
1535 1566
1536 err = uuid_read(c, j, &op.cl); 1567 err = uuid_read(c, j, &cl);
1537 if (err) 1568 if (err)
1538 goto err; 1569 goto err;
1539 1570
1540 err = "error in recovery"; 1571 err = "error in recovery";
1541 if (bch_btree_check(c, &op)) 1572 if (bch_btree_check(c))
1542 goto err; 1573 goto err;
1543 1574
1544 bch_journal_mark(c, &journal); 1575 bch_journal_mark(c, &journal);
@@ -1570,11 +1601,9 @@ static void run_cache_set(struct cache_set *c)
1570 if (j->version < BCACHE_JSET_VERSION_UUID) 1601 if (j->version < BCACHE_JSET_VERSION_UUID)
1571 __uuid_write(c); 1602 __uuid_write(c);
1572 1603
1573 bch_journal_replay(c, &journal, &op); 1604 bch_journal_replay(c, &journal);
1574 } else { 1605 } else {
1575 pr_notice("invalidating existing data"); 1606 pr_notice("invalidating existing data");
1576 /* Don't want invalidate_buckets() to queue a gc yet */
1577 closure_lock(&c->gc, NULL);
1578 1607
1579 for_each_cache(ca, c, i) { 1608 for_each_cache(ca, c, i) {
1580 unsigned j; 1609 unsigned j;
@@ -1600,15 +1629,15 @@ static void run_cache_set(struct cache_set *c)
1600 1629
1601 err = "cannot allocate new UUID bucket"; 1630 err = "cannot allocate new UUID bucket";
1602 if (__uuid_write(c)) 1631 if (__uuid_write(c))
1603 goto err_unlock_gc; 1632 goto err;
1604 1633
1605 err = "cannot allocate new btree root"; 1634 err = "cannot allocate new btree root";
1606 c->root = bch_btree_node_alloc(c, 0, &op.cl); 1635 c->root = bch_btree_node_alloc(c, 0, true);
1607 if (IS_ERR_OR_NULL(c->root)) 1636 if (IS_ERR_OR_NULL(c->root))
1608 goto err_unlock_gc; 1637 goto err;
1609 1638
1610 bkey_copy_key(&c->root->key, &MAX_KEY); 1639 bkey_copy_key(&c->root->key, &MAX_KEY);
1611 bch_btree_node_write(c->root, &op.cl); 1640 bch_btree_node_write(c->root, &cl);
1612 1641
1613 bch_btree_set_root(c->root); 1642 bch_btree_set_root(c->root);
1614 rw_unlock(true, c->root); 1643 rw_unlock(true, c->root);
@@ -1621,14 +1650,14 @@ static void run_cache_set(struct cache_set *c)
1621 SET_CACHE_SYNC(&c->sb, true); 1650 SET_CACHE_SYNC(&c->sb, true);
1622 1651
1623 bch_journal_next(&c->journal); 1652 bch_journal_next(&c->journal);
1624 bch_journal_meta(c, &op.cl); 1653 bch_journal_meta(c, &cl);
1625
1626 /* Unlock */
1627 closure_set_stopped(&c->gc.cl);
1628 closure_put(&c->gc.cl);
1629 } 1654 }
1630 1655
1631 closure_sync(&op.cl); 1656 err = "error starting gc thread";
1657 if (bch_gc_thread_start(c))
1658 goto err;
1659
1660 closure_sync(&cl);
1632 c->sb.last_mount = get_seconds(); 1661 c->sb.last_mount = get_seconds();
1633 bcache_write_super(c); 1662 bcache_write_super(c);
1634 1663
@@ -1638,13 +1667,10 @@ static void run_cache_set(struct cache_set *c)
1638 flash_devs_run(c); 1667 flash_devs_run(c);
1639 1668
1640 return; 1669 return;
1641err_unlock_gc:
1642 closure_set_stopped(&c->gc.cl);
1643 closure_put(&c->gc.cl);
1644err: 1670err:
1645 closure_sync(&op.cl); 1671 closure_sync(&cl);
1646 /* XXX: test this, it's broken */ 1672 /* XXX: test this, it's broken */
1647 bch_cache_set_error(c, err); 1673 bch_cache_set_error(c, "%s", err);
1648} 1674}
1649 1675
1650static bool can_attach_cache(struct cache *ca, struct cache_set *c) 1676static bool can_attach_cache(struct cache *ca, struct cache_set *c)
@@ -1725,8 +1751,6 @@ void bch_cache_release(struct kobject *kobj)
1725 if (ca->set) 1751 if (ca->set)
1726 ca->set->cache[ca->sb.nr_this_dev] = NULL; 1752 ca->set->cache[ca->sb.nr_this_dev] = NULL;
1727 1753
1728 bch_cache_allocator_exit(ca);
1729
1730 bio_split_pool_free(&ca->bio_split_hook); 1754 bio_split_pool_free(&ca->bio_split_hook);
1731 1755
1732 free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); 1756 free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
@@ -1758,8 +1782,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
1758 __module_get(THIS_MODULE); 1782 __module_get(THIS_MODULE);
1759 kobject_init(&ca->kobj, &bch_cache_ktype); 1783 kobject_init(&ca->kobj, &bch_cache_ktype);
1760 1784
1761 INIT_LIST_HEAD(&ca->discards);
1762
1763 bio_init(&ca->journal.bio); 1785 bio_init(&ca->journal.bio);
1764 ca->journal.bio.bi_max_vecs = 8; 1786 ca->journal.bio.bi_max_vecs = 8;
1765 ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; 1787 ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;
@@ -2006,7 +2028,6 @@ static struct notifier_block reboot = {
2006static void bcache_exit(void) 2028static void bcache_exit(void)
2007{ 2029{
2008 bch_debug_exit(); 2030 bch_debug_exit();
2009 bch_writeback_exit();
2010 bch_request_exit(); 2031 bch_request_exit();
2011 bch_btree_exit(); 2032 bch_btree_exit();
2012 if (bcache_kobj) 2033 if (bcache_kobj)
@@ -2039,7 +2060,6 @@ static int __init bcache_init(void)
2039 sysfs_create_files(bcache_kobj, files) || 2060 sysfs_create_files(bcache_kobj, files) ||
2040 bch_btree_init() || 2061 bch_btree_init() ||
2041 bch_request_init() || 2062 bch_request_init() ||
2042 bch_writeback_init() ||
2043 bch_debug_init(bcache_kobj)) 2063 bch_debug_init(bcache_kobj))
2044 goto err; 2064 goto err;
2045 2065
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 924dcfdae111..80d4c2bee18a 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -21,6 +21,12 @@ static const char * const cache_replacement_policies[] = {
21 NULL 21 NULL
22}; 22};
23 23
24static const char * const error_actions[] = {
25 "unregister",
26 "panic",
27 NULL
28};
29
24write_attribute(attach); 30write_attribute(attach);
25write_attribute(detach); 31write_attribute(detach);
26write_attribute(unregister); 32write_attribute(unregister);
@@ -66,7 +72,6 @@ rw_attribute(congested_read_threshold_us);
66rw_attribute(congested_write_threshold_us); 72rw_attribute(congested_write_threshold_us);
67 73
68rw_attribute(sequential_cutoff); 74rw_attribute(sequential_cutoff);
69rw_attribute(sequential_merge);
70rw_attribute(data_csum); 75rw_attribute(data_csum);
71rw_attribute(cache_mode); 76rw_attribute(cache_mode);
72rw_attribute(writeback_metadata); 77rw_attribute(writeback_metadata);
@@ -90,11 +95,14 @@ rw_attribute(discard);
90rw_attribute(running); 95rw_attribute(running);
91rw_attribute(label); 96rw_attribute(label);
92rw_attribute(readahead); 97rw_attribute(readahead);
98rw_attribute(errors);
93rw_attribute(io_error_limit); 99rw_attribute(io_error_limit);
94rw_attribute(io_error_halflife); 100rw_attribute(io_error_halflife);
95rw_attribute(verify); 101rw_attribute(verify);
102rw_attribute(bypass_torture_test);
96rw_attribute(key_merging_disabled); 103rw_attribute(key_merging_disabled);
97rw_attribute(gc_always_rewrite); 104rw_attribute(gc_always_rewrite);
105rw_attribute(expensive_debug_checks);
98rw_attribute(freelist_percent); 106rw_attribute(freelist_percent);
99rw_attribute(cache_replacement_policy); 107rw_attribute(cache_replacement_policy);
100rw_attribute(btree_shrinker_disabled); 108rw_attribute(btree_shrinker_disabled);
@@ -116,6 +124,7 @@ SHOW(__bch_cached_dev)
116 124
117 sysfs_printf(data_csum, "%i", dc->disk.data_csum); 125 sysfs_printf(data_csum, "%i", dc->disk.data_csum);
118 var_printf(verify, "%i"); 126 var_printf(verify, "%i");
127 var_printf(bypass_torture_test, "%i");
119 var_printf(writeback_metadata, "%i"); 128 var_printf(writeback_metadata, "%i");
120 var_printf(writeback_running, "%i"); 129 var_printf(writeback_running, "%i");
121 var_print(writeback_delay); 130 var_print(writeback_delay);
@@ -150,10 +159,9 @@ SHOW(__bch_cached_dev)
150 sysfs_hprint(dirty_data, 159 sysfs_hprint(dirty_data,
151 bcache_dev_sectors_dirty(&dc->disk) << 9); 160 bcache_dev_sectors_dirty(&dc->disk) << 9);
152 161
153 sysfs_hprint(stripe_size, (1 << dc->disk.stripe_size_bits) << 9); 162 sysfs_hprint(stripe_size, dc->disk.stripe_size << 9);
154 var_printf(partial_stripes_expensive, "%u"); 163 var_printf(partial_stripes_expensive, "%u");
155 164
156 var_printf(sequential_merge, "%i");
157 var_hprint(sequential_cutoff); 165 var_hprint(sequential_cutoff);
158 var_hprint(readahead); 166 var_hprint(readahead);
159 167
@@ -185,6 +193,7 @@ STORE(__cached_dev)
185 193
186 sysfs_strtoul(data_csum, dc->disk.data_csum); 194 sysfs_strtoul(data_csum, dc->disk.data_csum);
187 d_strtoul(verify); 195 d_strtoul(verify);
196 d_strtoul(bypass_torture_test);
188 d_strtoul(writeback_metadata); 197 d_strtoul(writeback_metadata);
189 d_strtoul(writeback_running); 198 d_strtoul(writeback_running);
190 d_strtoul(writeback_delay); 199 d_strtoul(writeback_delay);
@@ -199,7 +208,6 @@ STORE(__cached_dev)
199 dc->writeback_rate_p_term_inverse, 1, INT_MAX); 208 dc->writeback_rate_p_term_inverse, 1, INT_MAX);
200 d_strtoul(writeback_rate_d_smooth); 209 d_strtoul(writeback_rate_d_smooth);
201 210
202 d_strtoul(sequential_merge);
203 d_strtoi_h(sequential_cutoff); 211 d_strtoi_h(sequential_cutoff);
204 d_strtoi_h(readahead); 212 d_strtoi_h(readahead);
205 213
@@ -311,7 +319,6 @@ static struct attribute *bch_cached_dev_files[] = {
311 &sysfs_stripe_size, 319 &sysfs_stripe_size,
312 &sysfs_partial_stripes_expensive, 320 &sysfs_partial_stripes_expensive,
313 &sysfs_sequential_cutoff, 321 &sysfs_sequential_cutoff,
314 &sysfs_sequential_merge,
315 &sysfs_clear_stats, 322 &sysfs_clear_stats,
316 &sysfs_running, 323 &sysfs_running,
317 &sysfs_state, 324 &sysfs_state,
@@ -319,6 +326,7 @@ static struct attribute *bch_cached_dev_files[] = {
319 &sysfs_readahead, 326 &sysfs_readahead,
320#ifdef CONFIG_BCACHE_DEBUG 327#ifdef CONFIG_BCACHE_DEBUG
321 &sysfs_verify, 328 &sysfs_verify,
329 &sysfs_bypass_torture_test,
322#endif 330#endif
323 NULL 331 NULL
324}; 332};
@@ -366,7 +374,7 @@ STORE(__bch_flash_dev)
366 } 374 }
367 375
368 if (attr == &sysfs_unregister) { 376 if (attr == &sysfs_unregister) {
369 atomic_set(&d->detaching, 1); 377 set_bit(BCACHE_DEV_DETACHING, &d->flags);
370 bcache_device_stop(d); 378 bcache_device_stop(d);
371 } 379 }
372 380
@@ -481,7 +489,6 @@ lock_root:
481 489
482 sysfs_print(btree_used_percent, btree_used(c)); 490 sysfs_print(btree_used_percent, btree_used(c));
483 sysfs_print(btree_nodes, c->gc_stats.nodes); 491 sysfs_print(btree_nodes, c->gc_stats.nodes);
484 sysfs_hprint(dirty_data, c->gc_stats.dirty);
485 sysfs_hprint(average_key_size, average_key_size(c)); 492 sysfs_hprint(average_key_size, average_key_size(c));
486 493
487 sysfs_print(cache_read_races, 494 sysfs_print(cache_read_races,
@@ -492,6 +499,10 @@ lock_root:
492 sysfs_print(writeback_keys_failed, 499 sysfs_print(writeback_keys_failed,
493 atomic_long_read(&c->writeback_keys_failed)); 500 atomic_long_read(&c->writeback_keys_failed));
494 501
502 if (attr == &sysfs_errors)
503 return bch_snprint_string_list(buf, PAGE_SIZE, error_actions,
504 c->on_error);
505
495 /* See count_io_errors for why 88 */ 506 /* See count_io_errors for why 88 */
496 sysfs_print(io_error_halflife, c->error_decay * 88); 507 sysfs_print(io_error_halflife, c->error_decay * 88);
497 sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); 508 sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT);
@@ -506,6 +517,8 @@ lock_root:
506 sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); 517 sysfs_print(active_journal_entries, fifo_used(&c->journal.pin));
507 sysfs_printf(verify, "%i", c->verify); 518 sysfs_printf(verify, "%i", c->verify);
508 sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); 519 sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled);
520 sysfs_printf(expensive_debug_checks,
521 "%i", c->expensive_debug_checks);
509 sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); 522 sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite);
510 sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); 523 sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled);
511 sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); 524 sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
@@ -555,7 +568,7 @@ STORE(__bch_cache_set)
555 } 568 }
556 569
557 if (attr == &sysfs_trigger_gc) 570 if (attr == &sysfs_trigger_gc)
558 bch_queue_gc(c); 571 wake_up_gc(c);
559 572
560 if (attr == &sysfs_prune_cache) { 573 if (attr == &sysfs_prune_cache) {
561 struct shrink_control sc; 574 struct shrink_control sc;
@@ -569,6 +582,15 @@ STORE(__bch_cache_set)
569 sysfs_strtoul(congested_write_threshold_us, 582 sysfs_strtoul(congested_write_threshold_us,
570 c->congested_write_threshold_us); 583 c->congested_write_threshold_us);
571 584
585 if (attr == &sysfs_errors) {
586 ssize_t v = bch_read_string_list(buf, error_actions);
587
588 if (v < 0)
589 return v;
590
591 c->on_error = v;
592 }
593
572 if (attr == &sysfs_io_error_limit) 594 if (attr == &sysfs_io_error_limit)
573 c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; 595 c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT;
574 596
@@ -579,6 +601,7 @@ STORE(__bch_cache_set)
579 sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); 601 sysfs_strtoul(journal_delay_ms, c->journal_delay_ms);
580 sysfs_strtoul(verify, c->verify); 602 sysfs_strtoul(verify, c->verify);
581 sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); 603 sysfs_strtoul(key_merging_disabled, c->key_merging_disabled);
604 sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks);
582 sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); 605 sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite);
583 sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); 606 sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled);
584 sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); 607 sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled);
@@ -618,8 +641,8 @@ static struct attribute *bch_cache_set_files[] = {
618 &sysfs_cache_available_percent, 641 &sysfs_cache_available_percent,
619 642
620 &sysfs_average_key_size, 643 &sysfs_average_key_size,
621 &sysfs_dirty_data,
622 644
645 &sysfs_errors,
623 &sysfs_io_error_limit, 646 &sysfs_io_error_limit,
624 &sysfs_io_error_halflife, 647 &sysfs_io_error_halflife,
625 &sysfs_congested, 648 &sysfs_congested,
@@ -653,6 +676,7 @@ static struct attribute *bch_cache_set_internal_files[] = {
653#ifdef CONFIG_BCACHE_DEBUG 676#ifdef CONFIG_BCACHE_DEBUG
654 &sysfs_verify, 677 &sysfs_verify,
655 &sysfs_key_merging_disabled, 678 &sysfs_key_merging_disabled,
679 &sysfs_expensive_debug_checks,
656#endif 680#endif
657 &sysfs_gc_always_rewrite, 681 &sysfs_gc_always_rewrite,
658 &sysfs_btree_shrinker_disabled, 682 &sysfs_btree_shrinker_disabled,
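The sysfs changes above expose c->on_error through the new "errors" attribute: the show path prints the NULL-terminated error_actions[] list with bch_snprint_string_list() and the store path maps the written string back to an index with bch_read_string_list(). A rough userspace analogue of the lookup half (the helper below is illustrative; the real bch_read_string_list() does a bit more input cleanup):

#include <stdio.h>
#include <string.h>

static const char * const error_actions[] = {
	"unregister",
	"panic",
	NULL
};

/* Return the index of buf in a NULL-terminated list, or -1 if absent. */
static int example_read_string_list(const char *buf,
				    const char * const list[])
{
	for (int i = 0; list[i]; i++)
		if (!strcmp(buf, list[i]))
			return i;
	return -1;
}

int main(void)
{
	printf("panic -> %d\n", example_read_string_list("panic", error_actions));
	printf("bogus -> %d\n", example_read_string_list("bogus", error_actions));
	return 0;
}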
diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c
index f7b6c197f90f..adbc3df17a80 100644
--- a/drivers/md/bcache/trace.c
+++ b/drivers/md/bcache/trace.c
@@ -1,6 +1,5 @@
1#include "bcache.h" 1#include "bcache.h"
2#include "btree.h" 2#include "btree.h"
3#include "request.h"
4 3
5#include <linux/blktrace_api.h> 4#include <linux/blktrace_api.h>
6#include <linux/module.h> 5#include <linux/module.h>
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 420dad545c7d..462214eeacbe 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -168,10 +168,14 @@ int bch_parse_uuid(const char *s, char *uuid)
168 168
169void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) 169void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
170{ 170{
171 uint64_t now = local_clock(); 171 uint64_t now, duration, last;
172 uint64_t duration = time_after64(now, start_time) 172
173 spin_lock(&stats->lock);
174
175 now = local_clock();
176 duration = time_after64(now, start_time)
173 ? now - start_time : 0; 177 ? now - start_time : 0;
174 uint64_t last = time_after64(now, stats->last) 178 last = time_after64(now, stats->last)
175 ? now - stats->last : 0; 179 ? now - stats->last : 0;
176 180
177 stats->max_duration = max(stats->max_duration, duration); 181 stats->max_duration = max(stats->max_duration, duration);
@@ -188,6 +192,8 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
188 } 192 }
189 193
190 stats->last = now ?: 1; 194 stats->last = now ?: 1;
195
196 spin_unlock(&stats->lock);
191} 197}
192 198
193/** 199/**
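The util.c hunk above moves the whole read-modify-write of the time_stats fields under the new stats->lock, presumably because the stats can now be updated from more than one context at a time. A minimal sketch of the same pattern (struct and function names are illustrative):

#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/jiffies.h>	/* time_after64() */
#include <linux/sched.h>	/* local_clock() */

struct example_stats {
	spinlock_t lock;
	u64 last;
	u64 max_duration;
};

static void example_stats_update(struct example_stats *stats, u64 start)
{
	u64 now, duration;

	/* All fields belong together, so hold the lock across the whole
	 * read-modify-write rather than around individual stores. */
	spin_lock(&stats->lock);

	now = local_clock();
	duration = time_after64(now, start) ? now - start : 0;

	if (duration > stats->max_duration)
		stats->max_duration = duration;
	stats->last = now ?: 1;

	spin_unlock(&stats->lock);
}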
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index ea345c6896f4..362c4b3f8b4a 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -15,28 +15,18 @@
15 15
16struct closure; 16struct closure;
17 17
18#ifdef CONFIG_BCACHE_EDEBUG 18#ifdef CONFIG_BCACHE_DEBUG
19 19
20#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) 20#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
21#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) 21#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i)
22 22
23#else /* EDEBUG */ 23#else /* DEBUG */
24 24
25#define atomic_dec_bug(v) atomic_dec(v) 25#define atomic_dec_bug(v) atomic_dec(v)
26#define atomic_inc_bug(v, i) atomic_inc(v) 26#define atomic_inc_bug(v, i) atomic_inc(v)
27 27
28#endif 28#endif
29 29
30#define BITMASK(name, type, field, offset, size) \
31static inline uint64_t name(const type *k) \
32{ return (k->field >> offset) & ~(((uint64_t) ~0) << size); } \
33 \
34static inline void SET_##name(type *k, uint64_t v) \
35{ \
36 k->field &= ~(~((uint64_t) ~0 << size) << offset); \
37 k->field |= v << offset; \
38}
39
40#define DECLARE_HEAP(type, name) \ 30#define DECLARE_HEAP(type, name) \
41 struct { \ 31 struct { \
42 size_t size, used; \ 32 size_t size, used; \
@@ -388,6 +378,7 @@ ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[
388ssize_t bch_read_string_list(const char *buf, const char * const list[]); 378ssize_t bch_read_string_list(const char *buf, const char * const list[]);
389 379
390struct time_stats { 380struct time_stats {
381 spinlock_t lock;
391 /* 382 /*
392 * all fields are in nanoseconds, averages are ewmas stored left shifted 383 * all fields are in nanoseconds, averages are ewmas stored left shifted
393 * by 8 384 * by 8
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index ba3ee48320f2..99053b1251be 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -11,18 +11,11 @@
11#include "debug.h" 11#include "debug.h"
12#include "writeback.h" 12#include "writeback.h"
13 13
14#include <linux/delay.h>
15#include <linux/freezer.h>
16#include <linux/kthread.h>
14#include <trace/events/bcache.h> 17#include <trace/events/bcache.h>
15 18
16static struct workqueue_struct *dirty_wq;
17
18static void read_dirty(struct closure *);
19
20struct dirty_io {
21 struct closure cl;
22 struct cached_dev *dc;
23 struct bio bio;
24};
25
26/* Rate limiting */ 19/* Rate limiting */
27 20
28static void __update_writeback_rate(struct cached_dev *dc) 21static void __update_writeback_rate(struct cached_dev *dc)
@@ -72,9 +65,6 @@ out:
72 dc->writeback_rate_derivative = derivative; 65 dc->writeback_rate_derivative = derivative;
73 dc->writeback_rate_change = change; 66 dc->writeback_rate_change = change;
74 dc->writeback_rate_target = target; 67 dc->writeback_rate_target = target;
75
76 schedule_delayed_work(&dc->writeback_rate_update,
77 dc->writeback_rate_update_seconds * HZ);
78} 68}
79 69
80static void update_writeback_rate(struct work_struct *work) 70static void update_writeback_rate(struct work_struct *work)
@@ -90,13 +80,16 @@ static void update_writeback_rate(struct work_struct *work)
90 __update_writeback_rate(dc); 80 __update_writeback_rate(dc);
91 81
92 up_read(&dc->writeback_lock); 82 up_read(&dc->writeback_lock);
83
84 schedule_delayed_work(&dc->writeback_rate_update,
85 dc->writeback_rate_update_seconds * HZ);
93} 86}
94 87
95static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) 88static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
96{ 89{
97 uint64_t ret; 90 uint64_t ret;
98 91
99 if (atomic_read(&dc->disk.detaching) || 92 if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
100 !dc->writeback_percent) 93 !dc->writeback_percent)
101 return 0; 94 return 0;
102 95
@@ -105,37 +98,11 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
105 return min_t(uint64_t, ret, HZ); 98 return min_t(uint64_t, ret, HZ);
106} 99}
107 100
108/* Background writeback */ 101struct dirty_io {
109 102 struct closure cl;
110static bool dirty_pred(struct keybuf *buf, struct bkey *k) 103 struct cached_dev *dc;
111{ 104 struct bio bio;
112 return KEY_DIRTY(k); 105};
113}
114
115static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k)
116{
117 uint64_t stripe;
118 unsigned nr_sectors = KEY_SIZE(k);
119 struct cached_dev *dc = container_of(buf, struct cached_dev,
120 writeback_keys);
121 unsigned stripe_size = 1 << dc->disk.stripe_size_bits;
122
123 if (!KEY_DIRTY(k))
124 return false;
125
126 stripe = KEY_START(k) >> dc->disk.stripe_size_bits;
127 while (1) {
128 if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) !=
129 stripe_size)
130 return false;
131
132 if (nr_sectors <= stripe_size)
133 return true;
134
135 nr_sectors -= stripe_size;
136 stripe++;
137 }
138}
139 106
140static void dirty_init(struct keybuf_key *w) 107static void dirty_init(struct keybuf_key *w)
141{ 108{
@@ -153,131 +120,6 @@ static void dirty_init(struct keybuf_key *w)
153 bch_bio_map(bio, NULL); 120 bch_bio_map(bio, NULL);
154} 121}
155 122
156static void refill_dirty(struct closure *cl)
157{
158 struct cached_dev *dc = container_of(cl, struct cached_dev,
159 writeback.cl);
160 struct keybuf *buf = &dc->writeback_keys;
161 bool searched_from_start = false;
162 struct bkey end = MAX_KEY;
163 SET_KEY_INODE(&end, dc->disk.id);
164
165 if (!atomic_read(&dc->disk.detaching) &&
166 !dc->writeback_running)
167 closure_return(cl);
168
169 down_write(&dc->writeback_lock);
170
171 if (!atomic_read(&dc->has_dirty)) {
172 SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
173 bch_write_bdev_super(dc, NULL);
174
175 up_write(&dc->writeback_lock);
176 closure_return(cl);
177 }
178
179 if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
180 buf->last_scanned = KEY(dc->disk.id, 0, 0);
181 searched_from_start = true;
182 }
183
184 if (dc->partial_stripes_expensive) {
185 uint64_t i;
186
187 for (i = 0; i < dc->disk.nr_stripes; i++)
188 if (atomic_read(dc->disk.stripe_sectors_dirty + i) ==
189 1 << dc->disk.stripe_size_bits)
190 goto full_stripes;
191
192 goto normal_refill;
193full_stripes:
194 bch_refill_keybuf(dc->disk.c, buf, &end,
195 dirty_full_stripe_pred);
196 } else {
197normal_refill:
198 bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
199 }
200
201 if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
202 /* Searched the entire btree - delay awhile */
203
204 if (RB_EMPTY_ROOT(&buf->keys)) {
205 atomic_set(&dc->has_dirty, 0);
206 cached_dev_put(dc);
207 }
208
209 if (!atomic_read(&dc->disk.detaching))
210 closure_delay(&dc->writeback, dc->writeback_delay * HZ);
211 }
212
213 up_write(&dc->writeback_lock);
214
215 bch_ratelimit_reset(&dc->writeback_rate);
216
217 /* Punt to workqueue only so we don't recurse and blow the stack */
218 continue_at(cl, read_dirty, dirty_wq);
219}
220
221void bch_writeback_queue(struct cached_dev *dc)
222{
223 if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) {
224 if (!atomic_read(&dc->disk.detaching))
225 closure_delay(&dc->writeback, dc->writeback_delay * HZ);
226
227 continue_at(&dc->writeback.cl, refill_dirty, dirty_wq);
228 }
229}
230
231void bch_writeback_add(struct cached_dev *dc)
232{
233 if (!atomic_read(&dc->has_dirty) &&
234 !atomic_xchg(&dc->has_dirty, 1)) {
235 atomic_inc(&dc->count);
236
237 if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
238 SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
239 /* XXX: should do this synchronously */
240 bch_write_bdev_super(dc, NULL);
241 }
242
243 bch_writeback_queue(dc);
244
245 if (dc->writeback_percent)
246 schedule_delayed_work(&dc->writeback_rate_update,
247 dc->writeback_rate_update_seconds * HZ);
248 }
249}
250
251void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
252 uint64_t offset, int nr_sectors)
253{
254 struct bcache_device *d = c->devices[inode];
255 unsigned stripe_size, stripe_offset;
256 uint64_t stripe;
257
258 if (!d)
259 return;
260
261 stripe_size = 1 << d->stripe_size_bits;
262 stripe = offset >> d->stripe_size_bits;
263 stripe_offset = offset & (stripe_size - 1);
264
265 while (nr_sectors) {
266 int s = min_t(unsigned, abs(nr_sectors),
267 stripe_size - stripe_offset);
268
269 if (nr_sectors < 0)
270 s = -s;
271
272 atomic_add(s, d->stripe_sectors_dirty + stripe);
273 nr_sectors -= s;
274 stripe_offset = 0;
275 stripe++;
276 }
277}
278
279/* Background writeback - IO loop */
280
281static void dirty_io_destructor(struct closure *cl) 123static void dirty_io_destructor(struct closure *cl)
282{ 124{
283 struct dirty_io *io = container_of(cl, struct dirty_io, cl); 125 struct dirty_io *io = container_of(cl, struct dirty_io, cl);
@@ -297,26 +139,25 @@ static void write_dirty_finish(struct closure *cl)
297 139
298 /* This is kind of a dumb way of signalling errors. */ 140 /* This is kind of a dumb way of signalling errors. */
299 if (KEY_DIRTY(&w->key)) { 141 if (KEY_DIRTY(&w->key)) {
142 int ret;
300 unsigned i; 143 unsigned i;
301 struct btree_op op; 144 struct keylist keys;
302 bch_btree_op_init_stack(&op);
303 145
304 op.type = BTREE_REPLACE; 146 bch_keylist_init(&keys);
305 bkey_copy(&op.replace, &w->key);
306 147
307 SET_KEY_DIRTY(&w->key, false); 148 bkey_copy(keys.top, &w->key);
308 bch_keylist_add(&op.keys, &w->key); 149 SET_KEY_DIRTY(keys.top, false);
150 bch_keylist_push(&keys);
309 151
310 for (i = 0; i < KEY_PTRS(&w->key); i++) 152 for (i = 0; i < KEY_PTRS(&w->key); i++)
311 atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); 153 atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
312 154
313 bch_btree_insert(&op, dc->disk.c); 155 ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);
314 closure_sync(&op.cl);
315 156
316 if (op.insert_collision) 157 if (ret)
317 trace_bcache_writeback_collision(&w->key); 158 trace_bcache_writeback_collision(&w->key);
318 159
319 atomic_long_inc(op.insert_collision 160 atomic_long_inc(ret
320 ? &dc->disk.c->writeback_keys_failed 161 ? &dc->disk.c->writeback_keys_failed
321 : &dc->disk.c->writeback_keys_done); 162 : &dc->disk.c->writeback_keys_done);
322 } 163 }
@@ -374,30 +215,33 @@ static void read_dirty_submit(struct closure *cl)
374 continue_at(cl, write_dirty, system_wq); 215 continue_at(cl, write_dirty, system_wq);
375} 216}
376 217
377static void read_dirty(struct closure *cl) 218static void read_dirty(struct cached_dev *dc)
378{ 219{
379 struct cached_dev *dc = container_of(cl, struct cached_dev, 220 unsigned delay = 0;
380 writeback.cl);
381 unsigned delay = writeback_delay(dc, 0);
382 struct keybuf_key *w; 221 struct keybuf_key *w;
383 struct dirty_io *io; 222 struct dirty_io *io;
223 struct closure cl;
224
225 closure_init_stack(&cl);
384 226
385 /* 227 /*
386 * XXX: if we error, background writeback just spins. Should use some 228 * XXX: if we error, background writeback just spins. Should use some
387 * mempools. 229 * mempools.
388 */ 230 */
389 231
390 while (1) { 232 while (!kthread_should_stop()) {
233 try_to_freeze();
234
391 w = bch_keybuf_next(&dc->writeback_keys); 235 w = bch_keybuf_next(&dc->writeback_keys);
392 if (!w) 236 if (!w)
393 break; 237 break;
394 238
395 BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); 239 BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
396 240
397 if (delay > 0 && 241 if (KEY_START(&w->key) != dc->last_read ||
398 (KEY_START(&w->key) != dc->last_read || 242 jiffies_to_msecs(delay) > 50)
399 jiffies_to_msecs(delay) > 50)) 243 while (!kthread_should_stop() && delay)
400 delay = schedule_timeout_uninterruptible(delay); 244 delay = schedule_timeout_interruptible(delay);
401 245
402 dc->last_read = KEY_OFFSET(&w->key); 246 dc->last_read = KEY_OFFSET(&w->key);
403 247
@@ -423,7 +267,7 @@ static void read_dirty(struct closure *cl)
423 trace_bcache_writeback(&w->key); 267 trace_bcache_writeback(&w->key);
424 268
425 down(&dc->in_flight); 269 down(&dc->in_flight);
426 closure_call(&io->cl, read_dirty_submit, NULL, cl); 270 closure_call(&io->cl, read_dirty_submit, NULL, &cl);
427 271
428 delay = writeback_delay(dc, KEY_SIZE(&w->key)); 272 delay = writeback_delay(dc, KEY_SIZE(&w->key));
429 } 273 }
@@ -439,52 +283,205 @@ err:
439 * Wait for outstanding writeback IOs to finish (and keybuf slots to be 283 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
440 * freed) before refilling again 284 * freed) before refilling again
441 */ 285 */
442 continue_at(cl, refill_dirty, dirty_wq); 286 closure_sync(&cl);
443} 287}
444 288
445/* Init */ 289/* Scan for dirty data */
290
291void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
292 uint64_t offset, int nr_sectors)
293{
294 struct bcache_device *d = c->devices[inode];
295 unsigned stripe_offset, stripe, sectors_dirty;
296
297 if (!d)
298 return;
299
300 stripe = offset_to_stripe(d, offset);
301 stripe_offset = offset & (d->stripe_size - 1);
302
303 while (nr_sectors) {
304 int s = min_t(unsigned, abs(nr_sectors),
305 d->stripe_size - stripe_offset);
306
307 if (nr_sectors < 0)
308 s = -s;
309
310 if (stripe >= d->nr_stripes)
311 return;
312
313 sectors_dirty = atomic_add_return(s,
314 d->stripe_sectors_dirty + stripe);
315 if (sectors_dirty == d->stripe_size)
316 set_bit(stripe, d->full_dirty_stripes);
317 else
318 clear_bit(stripe, d->full_dirty_stripes);
319
320 nr_sectors -= s;
321 stripe_offset = 0;
322 stripe++;
323 }
324}
446 325
447static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op, 326static bool dirty_pred(struct keybuf *buf, struct bkey *k)
448 struct cached_dev *dc)
449{ 327{
450 struct bkey *k; 328 return KEY_DIRTY(k);
451 struct btree_iter iter; 329}
452 330
453 bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0)); 331static void refill_full_stripes(struct cached_dev *dc)
454 while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) 332{
455 if (!b->level) { 333 struct keybuf *buf = &dc->writeback_keys;
456 if (KEY_INODE(k) > dc->disk.id) 334 unsigned start_stripe, stripe, next_stripe;
457 break; 335 bool wrapped = false;
458 336
459 if (KEY_DIRTY(k)) 337 stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));
460 bcache_dev_sectors_dirty_add(b->c, dc->disk.id, 338
461 KEY_START(k), 339 if (stripe >= dc->disk.nr_stripes)
462 KEY_SIZE(k)); 340 stripe = 0;
463 } else { 341
464 btree(sectors_dirty_init, k, b, op, dc); 342 start_stripe = stripe;
465 if (KEY_INODE(k) > dc->disk.id) 343
466 break; 344 while (1) {
467 345 stripe = find_next_bit(dc->disk.full_dirty_stripes,
468 cond_resched(); 346 dc->disk.nr_stripes, stripe);
347
348 if (stripe == dc->disk.nr_stripes)
349 goto next;
350
351 next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
352 dc->disk.nr_stripes, stripe);
353
354 buf->last_scanned = KEY(dc->disk.id,
355 stripe * dc->disk.stripe_size, 0);
356
357 bch_refill_keybuf(dc->disk.c, buf,
358 &KEY(dc->disk.id,
359 next_stripe * dc->disk.stripe_size, 0),
360 dirty_pred);
361
362 if (array_freelist_empty(&buf->freelist))
363 return;
364
365 stripe = next_stripe;
366next:
367 if (wrapped && stripe > start_stripe)
368 return;
369
370 if (stripe == dc->disk.nr_stripes) {
371 stripe = 0;
372 wrapped = true;
469 } 373 }
374 }
375}
376
377static bool refill_dirty(struct cached_dev *dc)
378{
379 struct keybuf *buf = &dc->writeback_keys;
380 struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
381 bool searched_from_start = false;
382
383 if (dc->partial_stripes_expensive) {
384 refill_full_stripes(dc);
385 if (array_freelist_empty(&buf->freelist))
386 return false;
387 }
388
389 if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
390 buf->last_scanned = KEY(dc->disk.id, 0, 0);
391 searched_from_start = true;
392 }
393
394 bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
395
396 return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start;
397}
398
399static int bch_writeback_thread(void *arg)
400{
401 struct cached_dev *dc = arg;
402 bool searched_full_index;
403
404 while (!kthread_should_stop()) {
405 down_write(&dc->writeback_lock);
406 if (!atomic_read(&dc->has_dirty) ||
407 (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
408 !dc->writeback_running)) {
409 up_write(&dc->writeback_lock);
410 set_current_state(TASK_INTERRUPTIBLE);
411
412 if (kthread_should_stop())
413 return 0;
414
415 try_to_freeze();
416 schedule();
417 continue;
418 }
419
420 searched_full_index = refill_dirty(dc);
421
422 if (searched_full_index &&
423 RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
424 atomic_set(&dc->has_dirty, 0);
425 cached_dev_put(dc);
426 SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
427 bch_write_bdev_super(dc, NULL);
428 }
429
430 up_write(&dc->writeback_lock);
431
432 bch_ratelimit_reset(&dc->writeback_rate);
433 read_dirty(dc);
434
435 if (searched_full_index) {
436 unsigned delay = dc->writeback_delay * HZ;
437
438 while (delay &&
439 !kthread_should_stop() &&
440 !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
441 delay = schedule_timeout_interruptible(delay);
442 }
443 }
470 444
471 return 0; 445 return 0;
472} 446}
473 447
448/* Init */
449
450struct sectors_dirty_init {
451 struct btree_op op;
452 unsigned inode;
453};
454
455static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
456 struct bkey *k)
457{
458 struct sectors_dirty_init *op = container_of(_op,
459 struct sectors_dirty_init, op);
460 if (KEY_INODE(k) > op->inode)
461 return MAP_DONE;
462
463 if (KEY_DIRTY(k))
464 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
465 KEY_START(k), KEY_SIZE(k));
466
467 return MAP_CONTINUE;
468}
469
474void bch_sectors_dirty_init(struct cached_dev *dc) 470void bch_sectors_dirty_init(struct cached_dev *dc)
475{ 471{
476 struct btree_op op; 472 struct sectors_dirty_init op;
473
474 bch_btree_op_init(&op.op, -1);
475 op.inode = dc->disk.id;
477 476
478 bch_btree_op_init_stack(&op); 477 bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
479 btree_root(sectors_dirty_init, dc->disk.c, &op, dc); 478 sectors_dirty_init_fn, 0);
480} 479}
481 480
482void bch_cached_dev_writeback_init(struct cached_dev *dc) 481int bch_cached_dev_writeback_init(struct cached_dev *dc)
483{ 482{
484 sema_init(&dc->in_flight, 64); 483 sema_init(&dc->in_flight, 64);
485 closure_init_unlocked(&dc->writeback);
486 init_rwsem(&dc->writeback_lock); 484 init_rwsem(&dc->writeback_lock);
487
488 bch_keybuf_init(&dc->writeback_keys); 485 bch_keybuf_init(&dc->writeback_keys);
489 486
490 dc->writeback_metadata = true; 487 dc->writeback_metadata = true;
@@ -498,22 +495,16 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
498 dc->writeback_rate_p_term_inverse = 64; 495 dc->writeback_rate_p_term_inverse = 64;
499 dc->writeback_rate_d_smooth = 8; 496 dc->writeback_rate_d_smooth = 8;
500 497
498 dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
499 "bcache_writeback");
500 if (IS_ERR(dc->writeback_thread))
501 return PTR_ERR(dc->writeback_thread);
502
503 set_task_state(dc->writeback_thread, TASK_INTERRUPTIBLE);
504
501 INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); 505 INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
502 schedule_delayed_work(&dc->writeback_rate_update, 506 schedule_delayed_work(&dc->writeback_rate_update,
503 dc->writeback_rate_update_seconds * HZ); 507 dc->writeback_rate_update_seconds * HZ);
504}
505
506void bch_writeback_exit(void)
507{
508 if (dirty_wq)
509 destroy_workqueue(dirty_wq);
510}
511
512int __init bch_writeback_init(void)
513{
514 dirty_wq = create_workqueue("bcache_writeback");
515 if (!dirty_wq)
516 return -ENOMEM;
517 508
518 return 0; 509 return 0;
519} 510}
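The writeback.c rewrite above replaces the closure/workqueue state machine with one kthread per cached device. A stripped-down sketch of that thread pattern, using only the kthread/freezer calls that appear in the patch; the two example_* helpers are stubs standing in for the refill/read_dirty work:

#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/sched.h>

static bool example_have_work(void *arg) { return false; }	/* stub */
static void example_do_writeback(void *arg) { }			/* stub */

static int example_writeback_thread(void *arg)
{
	while (!kthread_should_stop()) {
		if (!example_have_work(arg)) {
			/* Sleep until a wake_up_process() from the producer
			 * side (cf. bch_writeback_queue() in writeback.h). */
			set_current_state(TASK_INTERRUPTIBLE);
			if (kthread_should_stop())
				return 0;
			try_to_freeze();
			schedule();
			continue;
		}

		example_do_writeback(arg);
	}

	return 0;
}

As in bch_cached_dev_writeback_init(), such a thread would be created with kthread_create() (it stays asleep until the first wake-up) and torn down with kthread_stop(), which is what cached_dev_free() now does.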
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index c91f61bb95b6..c9ddcf4614b9 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -14,20 +14,27 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
14 return ret; 14 return ret;
15} 15}
16 16
17static inline bool bcache_dev_stripe_dirty(struct bcache_device *d, 17static inline unsigned offset_to_stripe(struct bcache_device *d,
18 uint64_t offset)
19{
20 do_div(offset, d->stripe_size);
21 return offset;
22}
23
24static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc,
18 uint64_t offset, 25 uint64_t offset,
19 unsigned nr_sectors) 26 unsigned nr_sectors)
20{ 27{
21 uint64_t stripe = offset >> d->stripe_size_bits; 28 unsigned stripe = offset_to_stripe(&dc->disk, offset);
22 29
23 while (1) { 30 while (1) {
24 if (atomic_read(d->stripe_sectors_dirty + stripe)) 31 if (atomic_read(dc->disk.stripe_sectors_dirty + stripe))
25 return true; 32 return true;
26 33
27 if (nr_sectors <= 1 << d->stripe_size_bits) 34 if (nr_sectors <= dc->disk.stripe_size)
28 return false; 35 return false;
29 36
30 nr_sectors -= 1 << d->stripe_size_bits; 37 nr_sectors -= dc->disk.stripe_size;
31 stripe++; 38 stripe++;
32 } 39 }
33} 40}
@@ -38,12 +45,12 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
38 unsigned in_use = dc->disk.c->gc_stats.in_use; 45 unsigned in_use = dc->disk.c->gc_stats.in_use;
39 46
40 if (cache_mode != CACHE_MODE_WRITEBACK || 47 if (cache_mode != CACHE_MODE_WRITEBACK ||
41 atomic_read(&dc->disk.detaching) || 48 test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
42 in_use > CUTOFF_WRITEBACK_SYNC) 49 in_use > CUTOFF_WRITEBACK_SYNC)
43 return false; 50 return false;
44 51
45 if (dc->partial_stripes_expensive && 52 if (dc->partial_stripes_expensive &&
46 bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector, 53 bcache_dev_stripe_dirty(dc, bio->bi_sector,
47 bio_sectors(bio))) 54 bio_sectors(bio)))
48 return true; 55 return true;
49 56
@@ -54,11 +61,30 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
54 in_use <= CUTOFF_WRITEBACK; 61 in_use <= CUTOFF_WRITEBACK;
55} 62}
56 63
64static inline void bch_writeback_queue(struct cached_dev *dc)
65{
66 wake_up_process(dc->writeback_thread);
67}
68
69static inline void bch_writeback_add(struct cached_dev *dc)
70{
71 if (!atomic_read(&dc->has_dirty) &&
72 !atomic_xchg(&dc->has_dirty, 1)) {
73 atomic_inc(&dc->count);
74
75 if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
76 SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
77 /* XXX: should do this synchronously */
78 bch_write_bdev_super(dc, NULL);
79 }
80
81 bch_writeback_queue(dc);
82 }
83}
84
57void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); 85void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
58void bch_writeback_queue(struct cached_dev *);
59void bch_writeback_add(struct cached_dev *);
60 86
61void bch_sectors_dirty_init(struct cached_dev *dc); 87void bch_sectors_dirty_init(struct cached_dev *dc);
62void bch_cached_dev_writeback_init(struct cached_dev *); 88int bch_cached_dev_writeback_init(struct cached_dev *);
63 89
64#endif 90#endif
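offset_to_stripe() above relies on do_div(), which divides the 64-bit offset in place and returns the remainder, so the helper hands back the quotient (the stripe index). The switch from the old stripe_size_bits shift to a division suggests stripe_size no longer has to be a power of two. In plain arithmetic the mapping is just the following (userspace illustration, names are made up):

#include <stdint.h>
#include <stdio.h>

static void example_offset_to_stripe(uint64_t offset, uint64_t stripe_size,
				     uint64_t *stripe, uint64_t *stripe_offset)
{
	*stripe = offset / stripe_size;		/* which stripe holds the sector */
	*stripe_offset = offset % stripe_size;	/* position inside that stripe */
}

int main(void)
{
	uint64_t stripe, off;

	example_offset_to_stripe(10000, 2048, &stripe, &off);
	printf("stripe %llu, in-stripe offset %llu\n",	/* prints 4 and 1808 */
	       (unsigned long long)stripe, (unsigned long long)off);
	return 0;
}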
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 451bf99582ff..846d5c6609d8 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -2978,12 +2978,12 @@ static int dasd_alloc_queue(struct dasd_block *block)
2978 2978
2979 elevator_exit(block->request_queue->elevator); 2979 elevator_exit(block->request_queue->elevator);
2980 block->request_queue->elevator = NULL; 2980 block->request_queue->elevator = NULL;
2981 mutex_lock(&block->request_queue->sysfs_lock);
2981 rc = elevator_init(block->request_queue, "deadline"); 2982 rc = elevator_init(block->request_queue, "deadline");
2982 if (rc) { 2983 if (rc)
2983 blk_cleanup_queue(block->request_queue); 2984 blk_cleanup_queue(block->request_queue);
2984 return rc; 2985 mutex_unlock(&block->request_queue->sysfs_lock);
2985 } 2986 return rc;
2986 return 0;
2987} 2987}
2988 2988
2989/* 2989/*
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index 5ebda976ea93..e2b9576d00e2 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -6,11 +6,9 @@
6 6
7#include <linux/tracepoint.h> 7#include <linux/tracepoint.h>
8 8
9struct search;
10
11DECLARE_EVENT_CLASS(bcache_request, 9DECLARE_EVENT_CLASS(bcache_request,
12 TP_PROTO(struct search *s, struct bio *bio), 10 TP_PROTO(struct bcache_device *d, struct bio *bio),
13 TP_ARGS(s, bio), 11 TP_ARGS(d, bio),
14 12
15 TP_STRUCT__entry( 13 TP_STRUCT__entry(
16 __field(dev_t, dev ) 14 __field(dev_t, dev )
@@ -24,8 +22,8 @@ DECLARE_EVENT_CLASS(bcache_request,
24 22
25 TP_fast_assign( 23 TP_fast_assign(
26 __entry->dev = bio->bi_bdev->bd_dev; 24 __entry->dev = bio->bi_bdev->bd_dev;
27 __entry->orig_major = s->d->disk->major; 25 __entry->orig_major = d->disk->major;
28 __entry->orig_minor = s->d->disk->first_minor; 26 __entry->orig_minor = d->disk->first_minor;
29 __entry->sector = bio->bi_sector; 27 __entry->sector = bio->bi_sector;
30 __entry->orig_sector = bio->bi_sector - 16; 28 __entry->orig_sector = bio->bi_sector - 16;
31 __entry->nr_sector = bio->bi_size >> 9; 29 __entry->nr_sector = bio->bi_size >> 9;
@@ -79,13 +77,13 @@ DECLARE_EVENT_CLASS(btree_node,
79/* request.c */ 77/* request.c */
80 78
81DEFINE_EVENT(bcache_request, bcache_request_start, 79DEFINE_EVENT(bcache_request, bcache_request_start,
82 TP_PROTO(struct search *s, struct bio *bio), 80 TP_PROTO(struct bcache_device *d, struct bio *bio),
83 TP_ARGS(s, bio) 81 TP_ARGS(d, bio)
84); 82);
85 83
86DEFINE_EVENT(bcache_request, bcache_request_end, 84DEFINE_EVENT(bcache_request, bcache_request_end,
87 TP_PROTO(struct search *s, struct bio *bio), 85 TP_PROTO(struct bcache_device *d, struct bio *bio),
88 TP_ARGS(s, bio) 86 TP_ARGS(d, bio)
89); 87);
90 88
91DECLARE_EVENT_CLASS(bcache_bio, 89DECLARE_EVENT_CLASS(bcache_bio,
@@ -370,6 +368,35 @@ DEFINE_EVENT(btree_node, bcache_btree_set_root,
370 TP_ARGS(b) 368 TP_ARGS(b)
371); 369);
372 370
371TRACE_EVENT(bcache_keyscan,
372 TP_PROTO(unsigned nr_found,
373 unsigned start_inode, uint64_t start_offset,
374 unsigned end_inode, uint64_t end_offset),
375 TP_ARGS(nr_found,
376 start_inode, start_offset,
377 end_inode, end_offset),
378
379 TP_STRUCT__entry(
380 __field(__u32, nr_found )
381 __field(__u32, start_inode )
382 __field(__u64, start_offset )
383 __field(__u32, end_inode )
384 __field(__u64, end_offset )
385 ),
386
387 TP_fast_assign(
388 __entry->nr_found = nr_found;
389 __entry->start_inode = start_inode;
390 __entry->start_offset = start_offset;
391 __entry->end_inode = end_inode;
392 __entry->end_offset = end_offset;
393 ),
394
395 TP_printk("found %u keys from %u:%llu to %u:%llu", __entry->nr_found,
396 __entry->start_inode, __entry->start_offset,
397 __entry->end_inode, __entry->end_offset)
398);
399
373/* Allocator */ 400/* Allocator */
374 401
375TRACE_EVENT(bcache_alloc_invalidate, 402TRACE_EVENT(bcache_alloc_invalidate,
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
new file mode 100644
index 000000000000..164a7e263988
--- /dev/null
+++ b/include/uapi/linux/bcache.h
@@ -0,0 +1,373 @@
1#ifndef _LINUX_BCACHE_H
2#define _LINUX_BCACHE_H
3
4/*
5 * Bcache on disk data structures
6 */
7
8#include <asm/types.h>
9
10#define BITMASK(name, type, field, offset, size) \
11static inline __u64 name(const type *k) \
12{ return (k->field >> offset) & ~(~0ULL << size); } \
13 \
14static inline void SET_##name(type *k, __u64 v) \
15{ \
16 k->field &= ~(~(~0ULL << size) << offset); \
17 k->field |= (v & ~(~0ULL << size)) << offset; \
18}
19
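For readability, here is roughly what one BITMASK() invocation generates. Taking KEY_FIELD(KEY_DIRTY, high, 36, 1) from further down in this header as the example (this expansion is shown for illustration only, it is not an addition to the patch):

static inline __u64 KEY_DIRTY(const struct bkey *k)
{ return (k->high >> 36) & ~(~0ULL << 1); }

static inline void SET_KEY_DIRTY(struct bkey *k, __u64 v)
{
	k->high &= ~(~(~0ULL << 1) << 36);
	k->high |= (v & ~(~0ULL << 1)) << 36;
}

Since ~(~0ULL << 1) is just 1, KEY_DIRTY() reads bit 36 of bkey.high and SET_KEY_DIRTY() overwrites that single bit.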
20/* Btree keys - all units are in sectors */
21
22struct bkey {
23 __u64 high;
24 __u64 low;
25 __u64 ptr[];
26};
27
28#define KEY_FIELD(name, field, offset, size) \
29 BITMASK(name, struct bkey, field, offset, size)
30
31#define PTR_FIELD(name, offset, size) \
32static inline __u64 name(const struct bkey *k, unsigned i) \
33{ return (k->ptr[i] >> offset) & ~(~0ULL << size); } \
34 \
35static inline void SET_##name(struct bkey *k, unsigned i, __u64 v) \
36{ \
37 k->ptr[i] &= ~(~(~0ULL << size) << offset); \
38 k->ptr[i] |= (v & ~(~0ULL << size)) << offset; \
39}
40
41#define KEY_SIZE_BITS 16
42
43KEY_FIELD(KEY_PTRS, high, 60, 3)
44KEY_FIELD(HEADER_SIZE, high, 58, 2)
45KEY_FIELD(KEY_CSUM, high, 56, 2)
46KEY_FIELD(KEY_PINNED, high, 55, 1)
47KEY_FIELD(KEY_DIRTY, high, 36, 1)
48
49KEY_FIELD(KEY_SIZE, high, 20, KEY_SIZE_BITS)
50KEY_FIELD(KEY_INODE, high, 0, 20)
51
52/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */
53
54static inline __u64 KEY_OFFSET(const struct bkey *k)
55{
56 return k->low;
57}
58
59static inline void SET_KEY_OFFSET(struct bkey *k, __u64 v)
60{
61 k->low = v;
62}
63
64/*
65 * The high bit being set is a relic from when we used it to do binary
66 * searches - it told you where a key started. It's not used anymore,
67 * and can probably be safely dropped.
68 */
69#define KEY(inode, offset, size) \
70((struct bkey) { \
71 .high = (1ULL << 63) | ((__u64) (size) << 20) | (inode), \
72 .low = (offset) \
73})
74
75#define ZERO_KEY KEY(0, 0, 0)
76
77#define MAX_KEY_INODE (~(~0 << 20))
78#define MAX_KEY_OFFSET (~0ULL >> 1)
79#define MAX_KEY KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0)
80
81#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k))
82#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0)
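A short usage sketch may help here (illustrative only; example_key_usage is an invented name): a bcache key's offset names the end of the extent it covers, so KEY_START()/START_KEY() back up by the size.

#include <assert.h>

static void example_key_usage(void)
{
	/* A key for inode 5 covering 8 sectors that end at sector 16. */
	struct bkey k = KEY(5, 16, 8);

	assert(KEY_INODE(&k)  == 5);
	assert(KEY_OFFSET(&k) == 16);
	assert(KEY_SIZE(&k)   == 8);
	assert(KEY_START(&k)  == 8);	/* first sector covered */
}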
83
84#define PTR_DEV_BITS 12
85
86PTR_FIELD(PTR_DEV, 51, PTR_DEV_BITS)
87PTR_FIELD(PTR_OFFSET, 8, 43)
88PTR_FIELD(PTR_GEN, 0, 8)
89
90#define PTR_CHECK_DEV ((1 << PTR_DEV_BITS) - 1)
91
92#define PTR(gen, offset, dev) \
93 ((((__u64) dev) << 51) | ((__u64) offset) << 8 | gen)
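To make the dev/offset/gen bit split concrete, a hedged round-trip sketch (example_ptr_roundtrip is an invented name, and the caller is assumed to have allocated room for ptr[0], e.g. via the BKEY_PADDED() union defined below):

static inline void example_ptr_roundtrip(struct bkey *k)
{
	k->ptr[0] = PTR(/* gen */ 3, /* offset */ 1024, /* dev */ 1);
	SET_KEY_PTRS(k, 1);

	/* Now PTR_DEV(k, 0) == 1, PTR_OFFSET(k, 0) == 1024, PTR_GEN(k, 0) == 3. */
}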
94
95/* Bkey utility code */
96
97static inline unsigned long bkey_u64s(const struct bkey *k)
98{
99 return (sizeof(struct bkey) / sizeof(__u64)) + KEY_PTRS(k);
100}
101
102static inline unsigned long bkey_bytes(const struct bkey *k)
103{
104 return bkey_u64s(k) * sizeof(__u64);
105}
106
107#define bkey_copy(_dest, _src) memcpy(_dest, _src, bkey_bytes(_src))
108
109static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src)
110{
111 SET_KEY_INODE(dest, KEY_INODE(src));
112 SET_KEY_OFFSET(dest, KEY_OFFSET(src));
113}
114
115static inline struct bkey *bkey_next(const struct bkey *k)
116{
117 __u64 *d = (void *) k;
118 return (struct bkey *) (d + bkey_u64s(k));
119}
120
121static inline struct bkey *bkey_last(const struct bkey *k, unsigned nr_keys)
122{
123 __u64 *d = (void *) k;
124 return (struct bkey *) (d + nr_keys);
125}
126/* Enough for a key with 6 pointers */
127#define BKEY_PAD 8
128
129#define BKEY_PADDED(key) \
130 union { struct bkey key; __u64 key ## _pad[BKEY_PAD]; }
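As a loose sketch of how these helpers combine (walk_keys and nr_u64s are invented names for the example): keys are laid out back to back as 64-bit words, each occupying bkey_u64s() words, so a packed key list is walked like this:

static inline void walk_keys(struct bkey *keys, unsigned nr_u64s)
{
	struct bkey *k, *end = bkey_last(keys, nr_u64s);

	for (k = keys; k < end; k = bkey_next(k)) {
		/* each key is bkey_bytes(k) bytes long, i.e. the two header
		 * words plus KEY_PTRS(k) pointers */
	}
}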
131
132/* Superblock */
133
134/* Version 0: Cache device
135 * Version 1: Backing device
136 * Version 2: Seed pointer into btree node checksum
137 * Version 3: Cache device with new UUID format
138 * Version 4: Backing device with data offset
139 */
140#define BCACHE_SB_VERSION_CDEV 0
141#define BCACHE_SB_VERSION_BDEV 1
142#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3
143#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4
144#define BCACHE_SB_MAX_VERSION 4
145
146#define SB_SECTOR 8
147#define SB_SIZE 4096
148#define SB_LABEL_SIZE 32
149#define SB_JOURNAL_BUCKETS 256U
150/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */
151#define MAX_CACHES_PER_SET 8
152
153#define BDEV_DATA_START_DEFAULT 16 /* sectors */
154
155struct cache_sb {
156 __u64 csum;
157 __u64 offset; /* sector where this sb was written */
158 __u64 version;
159
160 __u8 magic[16];
161
162 __u8 uuid[16];
163 union {
164 __u8 set_uuid[16];
165 __u64 set_magic;
166 };
167 __u8 label[SB_LABEL_SIZE];
168
169 __u64 flags;
170 __u64 seq;
171 __u64 pad[8];
172
173 union {
174 struct {
175 /* Cache devices */
176 __u64 nbuckets; /* device size */
177
178 __u16 block_size; /* sectors */
179 __u16 bucket_size; /* sectors */
180
181 __u16 nr_in_set;
182 __u16 nr_this_dev;
183 };
184 struct {
185 /* Backing devices */
186 __u64 data_offset;
187
188 /*
189 * block_size from the cache device section is still used by
190 * backing devices, so don't add anything here until we fix
191 * things to not need it for backing devices anymore
192 */
193 };
194 };
195
196 __u32 last_mount; /* time_t */
197
198 __u16 first_bucket;
199 union {
200 __u16 njournal_buckets;
201 __u16 keys;
202 };
203 __u64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */
204};
205
206static inline _Bool SB_IS_BDEV(const struct cache_sb *sb)
207{
208 return sb->version == BCACHE_SB_VERSION_BDEV
209 || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
210}
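A hedged userspace-style sketch of how this structure is meant to be consumed (example_read_sb, the file descriptor and the error handling are illustrative; real tooling also verifies csum and the magic):

#include <unistd.h>

static int example_read_sb(int fd, struct cache_sb *sb)
{
	/* The superblock lives SB_SECTOR sectors into the device. */
	if (pread(fd, sb, sizeof(*sb), SB_SECTOR << 9) != (ssize_t) sizeof(*sb))
		return -1;

	if (sb->version > BCACHE_SB_MAX_VERSION)
		return -1;

	/* 1 = backing device, 0 = cache device */
	return SB_IS_BDEV(sb) ? 1 : 0;
}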
211
212BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1);
213BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1);
214BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3);
215#define CACHE_REPLACEMENT_LRU 0U
216#define CACHE_REPLACEMENT_FIFO 1U
217#define CACHE_REPLACEMENT_RANDOM 2U
218
219BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4);
220#define CACHE_MODE_WRITETHROUGH 0U
221#define CACHE_MODE_WRITEBACK 1U
222#define CACHE_MODE_WRITEAROUND 2U
223#define CACHE_MODE_NONE 3U
224BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2);
225#define BDEV_STATE_NONE 0U
226#define BDEV_STATE_CLEAN 1U
227#define BDEV_STATE_DIRTY 2U
228#define BDEV_STATE_STALE 3U
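These generated accessors are what the writeback path in writeback.h above relies on via BDEV_STATE()/SET_BDEV_STATE(); a minimal in-memory sketch, with the function name invented for illustration:

static inline _Bool example_needs_dirty_flush(const struct cache_sb *sb)
{
	/* True for a writeback backing device whose superblock still says dirty. */
	return BDEV_CACHE_MODE(sb) == CACHE_MODE_WRITEBACK &&
	       BDEV_STATE(sb) == BDEV_STATE_DIRTY;
}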
229
230/*
231 * Magic numbers
232 *
233 * The various other data structures have their own magic numbers, which are
234 * xored with the first part of the cache set's UUID
235 */
236
237#define JSET_MAGIC 0x245235c1a3625032ULL
238#define PSET_MAGIC 0x6750e15f87337f91ULL
239#define BSET_MAGIC 0x90135c78b99e07f5ULL
240
241static inline __u64 jset_magic(struct cache_sb *sb)
242{
243 return sb->set_magic ^ JSET_MAGIC;
244}
245
246static inline __u64 pset_magic(struct cache_sb *sb)
247{
248 return sb->set_magic ^ PSET_MAGIC;
249}
250
251static inline __u64 bset_magic(struct cache_sb *sb)
252{
253 return sb->set_magic ^ BSET_MAGIC;
254}
255
256/*
257 * Journal
258 *
259 * On disk format for a journal entry:
260 * seq is monotonically increasing; every journal entry has its own unique
261 * sequence number.
262 *
263 * last_seq is the oldest journal entry that still has keys the btree hasn't
264 * flushed to disk yet.
265 *
266 * version is for on disk format changes.
267 */
268
269#define BCACHE_JSET_VERSION_UUIDv1 1
270#define BCACHE_JSET_VERSION_UUID 1 /* Always latest UUID format */
271#define BCACHE_JSET_VERSION 1
272
273struct jset {
274 __u64 csum;
275 __u64 magic;
276 __u64 seq;
277 __u32 version;
278 __u32 keys;
279
280 __u64 last_seq;
281
282 BKEY_PADDED(uuid_bucket);
283 BKEY_PADDED(btree_root);
284 __u16 btree_level;
285 __u16 pad[3];
286
287 __u64 prio_bucket[MAX_CACHES_PER_SET];
288
289 union {
290 struct bkey start[0];
291 __u64 d[0];
292 };
293};
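To make the journal layout concrete, a hedged sketch of validating one entry and walking the keys it carries (example_walk_jset and handle_key are invented names; a real reader also checks csum and the seq/last_seq window):

static inline int example_walk_jset(struct cache_sb *sb, struct jset *j,
				    void (*handle_key)(struct bkey *))
{
	struct bkey *k;

	if (j->magic != jset_magic(sb))
		return -1;	/* not a journal entry of this cache set */

	/* j->keys counts 64-bit words, which is what bkey_last() expects. */
	for (k = j->start; k < bkey_last(j->start, j->keys); k = bkey_next(k))
		handle_key(k);

	return 0;
}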
294
295/* Bucket prios/gens */
296
297struct prio_set {
298 __u64 csum;
299 __u64 magic;
300 __u64 seq;
301 __u32 version;
302 __u32 pad;
303
304 __u64 next_bucket;
305
306 struct bucket_disk {
307 __u16 prio;
308 __u8 gen;
309 } __attribute((packed)) data[];
310};
311
312/* UUIDS - per backing device/flash only volume metadata */
313
314struct uuid_entry {
315 union {
316 struct {
317 __u8 uuid[16];
318 __u8 label[32];
319 __u32 first_reg;
320 __u32 last_reg;
321 __u32 invalidated;
322
323 __u32 flags;
324 /* Size of flash only volumes */
325 __u64 sectors;
326 };
327
328 __u8 pad[128];
329 };
330};
331
332BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1);
333
334/* Btree nodes */
335
336/* Version 1: Seed pointer into btree node checksum
337 */
338#define BCACHE_BSET_CSUM 1
339#define BCACHE_BSET_VERSION 1
340
341/*
342 * Btree nodes
343 *
344 * On disk a btree node is a list/log of these; within each set the keys are
345 * sorted
346 */
347struct bset {
348 __u64 csum;
349 __u64 magic;
350 __u64 seq;
351 __u32 version;
352 __u32 keys;
353
354 union {
355 struct bkey start[0];
356 __u64 d[0];
357 };
358};
359
360/* OBSOLETE */
361
362/* UUIDS - per backing device/flash only volume metadata */
363
364struct uuid_entry_v0 {
365 __u8 uuid[16];
366 __u8 label[32];
367 __u32 first_reg;
368 __u32 last_reg;
369 __u32 invalidated;
370 __u32 pad;
371};
372
373#endif /* _LINUX_BCACHE_H */