author     Jens Axboe <axboe@kernel.dk>    2013-11-14 10:29:01 -0500
committer  Jens Axboe <axboe@kernel.dk>    2013-11-14 10:29:01 -0500
commit     1355b37f111b35cd6f53078ce63997aec473629f
tree       590ecf1b148fc631336213a956d8456ce85bdc42
parent     f618ef7c47934d1686a764d0c9f70f23e566683f
parent     c86949486d41d9e7d7681fc72923555114fd702f
Merge branch 'for-3.13/post-mq-drivers' into for-linus
50 files changed, 9366 insertions, 3379 deletions
diff --git a/Documentation/blockdev/floppy.txt b/Documentation/blockdev/floppy.txt
index 470fe4b5e379..e2240f5ab64d 100644
--- a/Documentation/blockdev/floppy.txt
+++ b/Documentation/blockdev/floppy.txt
@@ -39,15 +39,15 @@ Module configuration options | |||
39 | ============================ | 39 | ============================ |
40 | 40 | ||
41 | If you use the floppy driver as a module, use the following syntax: | 41 | If you use the floppy driver as a module, use the following syntax: |
42 | modprobe floppy <options> | 42 | modprobe floppy floppy="<options>" |
43 | 43 | ||
44 | Example: | 44 | Example: |
45 | modprobe floppy omnibook messages | 45 | modprobe floppy floppy="omnibook messages" |
46 | 46 | ||
47 | If you need certain options enabled every time you load the floppy driver, | 47 | If you need certain options enabled every time you load the floppy driver, |
48 | you can put: | 48 | you can put: |
49 | 49 | ||
50 | options floppy omnibook messages | 50 | options floppy floppy="omnibook messages" |
51 | 51 | ||
52 | in a configuration file in /etc/modprobe.d/. | 52 | in a configuration file in /etc/modprobe.d/. |
53 | 53 | ||
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 4682546c5da7..1b84778e9bbd 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -110,7 +110,7 @@ source "drivers/block/mtip32xx/Kconfig" | |||
110 | 110 | ||
111 | config BLK_CPQ_DA | 111 | config BLK_CPQ_DA |
112 | tristate "Compaq SMART2 support" | 112 | tristate "Compaq SMART2 support" |
113 | depends on PCI && VIRT_TO_BUS | 113 | depends on PCI && VIRT_TO_BUS && 0 |
114 | help | 114 | help |
115 | This is the driver for Compaq Smart Array controllers. Everyone | 115 | This is the driver for Compaq Smart Array controllers. Everyone |
116 | using these boards should say Y here. See the file | 116 | using these boards should say Y here. See the file |
@@ -319,6 +319,16 @@ config BLK_DEV_NVME | |||
319 | To compile this driver as a module, choose M here: the | 319 | To compile this driver as a module, choose M here: the |
320 | module will be called nvme. | 320 | module will be called nvme. |
321 | 321 | ||
322 | config BLK_DEV_SKD | ||
323 | tristate "STEC S1120 Block Driver" | ||
324 | depends on PCI | ||
325 | depends on 64BIT | ||
326 | ---help--- | ||
327 | Saying Y or M here will enable support for the | ||
328 | STEC, Inc. S1120 PCIe SSD. | ||
329 | |||
330 | Use device /dev/skd$N and /dev/skd$Np$M. | ||
331 | |||
322 | config BLK_DEV_OSD | 332 | config BLK_DEV_OSD |
323 | tristate "OSD object-as-blkdev support" | 333 | tristate "OSD object-as-blkdev support" |
324 | depends on SCSI_OSD_ULD | 334 | depends on SCSI_OSD_ULD |
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 03b3b4a2bd8a..8cc98cd0d4a8 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o | |||
23 | obj-$(CONFIG_MG_DISK) += mg_disk.o | 23 | obj-$(CONFIG_MG_DISK) += mg_disk.o |
24 | obj-$(CONFIG_SUNVDC) += sunvdc.o | 24 | obj-$(CONFIG_SUNVDC) += sunvdc.o |
25 | obj-$(CONFIG_BLK_DEV_NVME) += nvme.o | 25 | obj-$(CONFIG_BLK_DEV_NVME) += nvme.o |
26 | obj-$(CONFIG_BLK_DEV_SKD) += skd.o | ||
26 | obj-$(CONFIG_BLK_DEV_OSD) += osdblk.o | 27 | obj-$(CONFIG_BLK_DEV_OSD) += osdblk.o |
27 | 28 | ||
28 | obj-$(CONFIG_BLK_DEV_UMEM) += umem.o | 29 | obj-$(CONFIG_BLK_DEV_UMEM) += umem.o |
@@ -44,4 +45,5 @@ obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ | |||
44 | obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o | 45 | obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o |
45 | 46 | ||
46 | nvme-y := nvme-core.o nvme-scsi.o | 47 | nvme-y := nvme-core.o nvme-scsi.o |
48 | skd-y := skd_main.o | ||
47 | swim_mod-y := swim.o swim_asm.o | 49 | swim_mod-y := swim.o swim_asm.o |
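The two hunks above wire the new STEC skd driver into the build: a Kconfig entry gated on PCI and 64BIT, and a Makefile rule that builds skd.o from skd_main.o. For orientation only, a block driver brought in this way usually starts with module init/exit hooks that claim a block major; the following is a hypothetical minimal skeleton, not the actual skd_main.c:

        #include <linux/module.h>
        #include <linux/fs.h>   /* register_blkdev()/unregister_blkdev() */

        static int skd_major;   /* illustrative variable name */

        static int __init skd_init(void)
        {
                /* passing 0 lets the block layer pick a free major number */
                skd_major = register_blkdev(0, "skd");
                if (skd_major < 0)
                        return skd_major;
                /* a real driver would follow with PCI registration,
                 * request queue and gendisk setup */
                return 0;
        }

        static void __exit skd_exit(void)
        {
                unregister_blkdev(skd_major, "skd");
        }

        module_init(skd_init);
        module_exit(skd_exit);
        MODULE_LICENSE("GPL");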
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index edfa2515bc86..0c004ac05811 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -5183,7 +5183,7 @@ reinit_after_soft_reset: | |||
5183 | rebuild_lun_table(h, 1, 0); | 5183 | rebuild_lun_table(h, 1, 0); |
5184 | cciss_engage_scsi(h); | 5184 | cciss_engage_scsi(h); |
5185 | h->busy_initializing = 0; | 5185 | h->busy_initializing = 0; |
5186 | return 1; | 5186 | return 0; |
5187 | 5187 | ||
5188 | clean4: | 5188 | clean4: |
5189 | cciss_free_cmd_pool(h); | 5189 | cciss_free_cmd_pool(h); |
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 2d7f608d181c..0e06f0c5dd1e 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1474,7 +1474,8 @@ enum determine_dev_size { | |||
1474 | DS_ERROR = -1, | 1474 | DS_ERROR = -1, |
1475 | DS_UNCHANGED = 0, | 1475 | DS_UNCHANGED = 0, |
1476 | DS_SHRUNK = 1, | 1476 | DS_SHRUNK = 1, |
1477 | DS_GREW = 2 | 1477 | DS_GREW = 2, |
1478 | DS_GREW_FROM_ZERO = 3, | ||
1478 | }; | 1479 | }; |
1479 | extern enum determine_dev_size | 1480 | extern enum determine_dev_size |
1480 | drbd_determine_dev_size(struct drbd_conf *, enum dds_flags, struct resize_parms *) __must_hold(local); | 1481 | drbd_determine_dev_size(struct drbd_conf *, enum dds_flags, struct resize_parms *) __must_hold(local); |
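The enum gains DS_GREW_FROM_ZERO so callers of drbd_determine_dev_size() can distinguish a device that grew from a previous size of zero from an ordinary grow. A minimal sketch of the classification, condensed from the drbd_nl.c hunk further down (sizes in 512-byte sectors; the helper name is made up):

        static enum determine_dev_size
        classify_resize(sector_t la_size_sect, sector_t size)
        {
                if (size > la_size_sect)        /* grew ... */
                        return la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO;
                if (size < la_size_sect)
                        return DS_SHRUNK;
                return DS_UNCHANGED;
        }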
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 55635edf563b..9e3818b1bc83 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2750,13 +2750,6 @@ int __init drbd_init(void) | |||
2750 | return err; | 2750 | return err; |
2751 | } | 2751 | } |
2752 | 2752 | ||
2753 | err = drbd_genl_register(); | ||
2754 | if (err) { | ||
2755 | printk(KERN_ERR "drbd: unable to register generic netlink family\n"); | ||
2756 | goto fail; | ||
2757 | } | ||
2758 | |||
2759 | |||
2760 | register_reboot_notifier(&drbd_notifier); | 2753 | register_reboot_notifier(&drbd_notifier); |
2761 | 2754 | ||
2762 | /* | 2755 | /* |
@@ -2767,6 +2760,15 @@ int __init drbd_init(void) | |||
2767 | drbd_proc = NULL; /* play safe for drbd_cleanup */ | 2760 | drbd_proc = NULL; /* play safe for drbd_cleanup */ |
2768 | idr_init(&minors); | 2761 | idr_init(&minors); |
2769 | 2762 | ||
2763 | rwlock_init(&global_state_lock); | ||
2764 | INIT_LIST_HEAD(&drbd_tconns); | ||
2765 | |||
2766 | err = drbd_genl_register(); | ||
2767 | if (err) { | ||
2768 | printk(KERN_ERR "drbd: unable to register generic netlink family\n"); | ||
2769 | goto fail; | ||
2770 | } | ||
2771 | |||
2770 | err = drbd_create_mempools(); | 2772 | err = drbd_create_mempools(); |
2771 | if (err) | 2773 | if (err) |
2772 | goto fail; | 2774 | goto fail; |
@@ -2778,9 +2780,6 @@ int __init drbd_init(void) | |||
2778 | goto fail; | 2780 | goto fail; |
2779 | } | 2781 | } |
2780 | 2782 | ||
2781 | rwlock_init(&global_state_lock); | ||
2782 | INIT_LIST_HEAD(&drbd_tconns); | ||
2783 | |||
2784 | retry.wq = create_singlethread_workqueue("drbd-reissue"); | 2783 | retry.wq = create_singlethread_workqueue("drbd-reissue"); |
2785 | if (!retry.wq) { | 2784 | if (!retry.wq) { |
2786 | printk(KERN_ERR "drbd: unable to create retry workqueue\n"); | 2785 | printk(KERN_ERR "drbd: unable to create retry workqueue\n"); |
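The drbd_init() hunks reorder startup so that drbd_genl_register() runs only after global_state_lock and drbd_tconns are initialized: once the generic netlink family is registered, requests can arrive and touch that state. The general rule, as a hedged sketch with stand-in names (not DRBD code):

        #include <linux/spinlock.h>
        #include <linux/list.h>

        static rwlock_t example_state_lock;             /* stand-ins for the   */
        static LIST_HEAD(example_connections);          /* state handlers use  */
        static int example_register_netlink(void);      /* hypothetical iface  */

        static int __init example_init(void)
        {
                int err;

                /* 1. initialize everything a handler callback might touch */
                rwlock_init(&example_state_lock);

                /* 2. only then expose externally reachable interfaces */
                err = example_register_netlink();
                if (err)
                        return err;
                return 0;
        }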
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 8cc1e640f485..c706d50a8b06 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -955,7 +955,7 @@ drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags, struct res | |||
955 | } | 955 | } |
956 | 956 | ||
957 | if (size > la_size_sect) | 957 | if (size > la_size_sect) |
958 | rv = DS_GREW; | 958 | rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO; |
959 | if (size < la_size_sect) | 959 | if (size < la_size_sect) |
960 | rv = DS_SHRUNK; | 960 | rv = DS_SHRUNK; |
961 | 961 | ||
@@ -1132,9 +1132,9 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) | |||
1132 | /* We may ignore peer limits if the peer is modern enough. | 1132 | /* We may ignore peer limits if the peer is modern enough. |
1133 | Because new from 8.3.8 onwards the peer can use multiple | 1133 | Because new from 8.3.8 onwards the peer can use multiple |
1134 | BIOs for a single peer_request */ | 1134 | BIOs for a single peer_request */ |
1135 | if (mdev->state.conn >= C_CONNECTED) { | 1135 | if (mdev->state.conn >= C_WF_REPORT_PARAMS) { |
1136 | if (mdev->tconn->agreed_pro_version < 94) | 1136 | if (mdev->tconn->agreed_pro_version < 94) |
1137 | peer = min( mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); | 1137 | peer = min(mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); |
1138 | /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ | 1138 | /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ |
1139 | else if (mdev->tconn->agreed_pro_version == 94) | 1139 | else if (mdev->tconn->agreed_pro_version == 94) |
1140 | peer = DRBD_MAX_SIZE_H80_PACKET; | 1140 | peer = DRBD_MAX_SIZE_H80_PACKET; |
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index cc29cd3bf78b..6fa6673b36b3 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1890,29 +1890,11 @@ static u32 seq_max(u32 a, u32 b) | |||
1890 | return seq_greater(a, b) ? a : b; | 1890 | return seq_greater(a, b) ? a : b; |
1891 | } | 1891 | } |
1892 | 1892 | ||
1893 | static bool need_peer_seq(struct drbd_conf *mdev) | ||
1894 | { | ||
1895 | struct drbd_tconn *tconn = mdev->tconn; | ||
1896 | int tp; | ||
1897 | |||
1898 | /* | ||
1899 | * We only need to keep track of the last packet_seq number of our peer | ||
1900 | * if we are in dual-primary mode and we have the resolve-conflicts flag set; see | ||
1901 | * handle_write_conflicts(). | ||
1902 | */ | ||
1903 | |||
1904 | rcu_read_lock(); | ||
1905 | tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries; | ||
1906 | rcu_read_unlock(); | ||
1907 | |||
1908 | return tp && test_bit(RESOLVE_CONFLICTS, &tconn->flags); | ||
1909 | } | ||
1910 | |||
1911 | static void update_peer_seq(struct drbd_conf *mdev, unsigned int peer_seq) | 1893 | static void update_peer_seq(struct drbd_conf *mdev, unsigned int peer_seq) |
1912 | { | 1894 | { |
1913 | unsigned int newest_peer_seq; | 1895 | unsigned int newest_peer_seq; |
1914 | 1896 | ||
1915 | if (need_peer_seq(mdev)) { | 1897 | if (test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags)) { |
1916 | spin_lock(&mdev->peer_seq_lock); | 1898 | spin_lock(&mdev->peer_seq_lock); |
1917 | newest_peer_seq = seq_max(mdev->peer_seq, peer_seq); | 1899 | newest_peer_seq = seq_max(mdev->peer_seq, peer_seq); |
1918 | mdev->peer_seq = newest_peer_seq; | 1900 | mdev->peer_seq = newest_peer_seq; |
@@ -1972,22 +1954,31 @@ static int wait_for_and_update_peer_seq(struct drbd_conf *mdev, const u32 peer_s | |||
1972 | { | 1954 | { |
1973 | DEFINE_WAIT(wait); | 1955 | DEFINE_WAIT(wait); |
1974 | long timeout; | 1956 | long timeout; |
1975 | int ret; | 1957 | int ret = 0, tp; |
1976 | 1958 | ||
1977 | if (!need_peer_seq(mdev)) | 1959 | if (!test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags)) |
1978 | return 0; | 1960 | return 0; |
1979 | 1961 | ||
1980 | spin_lock(&mdev->peer_seq_lock); | 1962 | spin_lock(&mdev->peer_seq_lock); |
1981 | for (;;) { | 1963 | for (;;) { |
1982 | if (!seq_greater(peer_seq - 1, mdev->peer_seq)) { | 1964 | if (!seq_greater(peer_seq - 1, mdev->peer_seq)) { |
1983 | mdev->peer_seq = seq_max(mdev->peer_seq, peer_seq); | 1965 | mdev->peer_seq = seq_max(mdev->peer_seq, peer_seq); |
1984 | ret = 0; | ||
1985 | break; | 1966 | break; |
1986 | } | 1967 | } |
1968 | |||
1987 | if (signal_pending(current)) { | 1969 | if (signal_pending(current)) { |
1988 | ret = -ERESTARTSYS; | 1970 | ret = -ERESTARTSYS; |
1989 | break; | 1971 | break; |
1990 | } | 1972 | } |
1973 | |||
1974 | rcu_read_lock(); | ||
1975 | tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries; | ||
1976 | rcu_read_unlock(); | ||
1977 | |||
1978 | if (!tp) | ||
1979 | break; | ||
1980 | |||
1981 | /* Only need to wait if two_primaries is enabled */ | ||
1991 | prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE); | 1982 | prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE); |
1992 | spin_unlock(&mdev->peer_seq_lock); | 1983 | spin_unlock(&mdev->peer_seq_lock); |
1993 | rcu_read_lock(); | 1984 | rcu_read_lock(); |
@@ -2228,8 +2219,10 @@ static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi) | |||
2228 | } | 2219 | } |
2229 | goto out_interrupted; | 2220 | goto out_interrupted; |
2230 | } | 2221 | } |
2231 | } else | 2222 | } else { |
2223 | update_peer_seq(mdev, peer_seq); | ||
2232 | spin_lock_irq(&mdev->tconn->req_lock); | 2224 | spin_lock_irq(&mdev->tconn->req_lock); |
2225 | } | ||
2233 | list_add(&peer_req->w.list, &mdev->active_ee); | 2226 | list_add(&peer_req->w.list, &mdev->active_ee); |
2234 | spin_unlock_irq(&mdev->tconn->req_lock); | 2227 | spin_unlock_irq(&mdev->tconn->req_lock); |
2235 | 2228 | ||
@@ -4132,7 +4125,11 @@ recv_bm_rle_bits(struct drbd_conf *mdev, | |||
4132 | (unsigned int)bs.buf_len); | 4125 | (unsigned int)bs.buf_len); |
4133 | return -EIO; | 4126 | return -EIO; |
4134 | } | 4127 | } |
4135 | look_ahead >>= bits; | 4128 | /* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */ |
4129 | if (likely(bits < 64)) | ||
4130 | look_ahead >>= bits; | ||
4131 | else | ||
4132 | look_ahead = 0; | ||
4136 | have -= bits; | 4133 | have -= bits; |
4137 | 4134 | ||
4138 | bits = bitstream_get_bits(&bs, &tmp, 64 - have); | 4135 | bits = bitstream_get_bits(&bs, &tmp, 64 - have); |
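The recv_bm_rle_bits() fix guards against a shift count of 64: in C, shifting a 64-bit value by 64 or more is undefined behaviour, and here "bits" can legitimately reach 64 once the whole look-ahead word has been consumed. A standalone illustration of the guard (hypothetical helper name):

        #include <linux/types.h>

        /* shift out the consumed bits; x >> 64 is undefined for a 64-bit
         * type, so the all-consumed case is handled explicitly */
        static inline u64 consume_bits(u64 look_ahead, unsigned int bits)
        {
                return bits < 64 ? look_ahead >> bits : 0;
        }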
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index c24379ffd4e3..fec7bef44994 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1306,6 +1306,7 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct | |||
1306 | int backing_limit; | 1306 | int backing_limit; |
1307 | 1307 | ||
1308 | if (bio_size && get_ldev(mdev)) { | 1308 | if (bio_size && get_ldev(mdev)) { |
1309 | unsigned int max_hw_sectors = queue_max_hw_sectors(q); | ||
1309 | struct request_queue * const b = | 1310 | struct request_queue * const b = |
1310 | mdev->ldev->backing_bdev->bd_disk->queue; | 1311 | mdev->ldev->backing_bdev->bd_disk->queue; |
1311 | if (b->merge_bvec_fn) { | 1312 | if (b->merge_bvec_fn) { |
@@ -1313,6 +1314,8 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct | |||
1313 | limit = min(limit, backing_limit); | 1314 | limit = min(limit, backing_limit); |
1314 | } | 1315 | } |
1315 | put_ldev(mdev); | 1316 | put_ldev(mdev); |
1317 | if ((limit >> 9) > max_hw_sectors) | ||
1318 | limit = max_hw_sectors << 9; | ||
1316 | } | 1319 | } |
1317 | return limit; | 1320 | return limit; |
1318 | } | 1321 | } |
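The added clamp in drbd_merge_bvec() keeps the merge limit, which is counted in bytes, within the queue's max_hw_sectors, which is counted in 512-byte sectors; the >> 9 and << 9 do the unit conversion. A small sketch of just that arithmetic (hypothetical helper):

        /* limit is in bytes, max_hw_sectors in 512-byte sectors (1 << 9 bytes each) */
        static inline int clamp_limit_to_hw(int limit, unsigned int max_hw_sectors)
        {
                if ((limit >> 9) > max_hw_sectors)
                        limit = max_hw_sectors << 9;
                return limit;
        }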
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index dbdb88a4976c..c8dac7305244 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -894,13 +894,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, | |||
894 | 894 | ||
895 | bio_list_init(&lo->lo_bio_list); | 895 | bio_list_init(&lo->lo_bio_list); |
896 | 896 | ||
897 | /* | ||
898 | * set queue make_request_fn, and add limits based on lower level | ||
899 | * device | ||
900 | */ | ||
901 | blk_queue_make_request(lo->lo_queue, loop_make_request); | ||
902 | lo->lo_queue->queuedata = lo; | ||
903 | |||
904 | if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) | 897 | if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) |
905 | blk_queue_flush(lo->lo_queue, REQ_FLUSH); | 898 | blk_queue_flush(lo->lo_queue, REQ_FLUSH); |
906 | 899 | ||
@@ -1618,6 +1611,8 @@ static int loop_add(struct loop_device **l, int i) | |||
1618 | if (!lo) | 1611 | if (!lo) |
1619 | goto out; | 1612 | goto out; |
1620 | 1613 | ||
1614 | lo->lo_state = Lo_unbound; | ||
1615 | |||
1621 | /* allocate id, if @id >= 0, we're requesting that specific id */ | 1616 | /* allocate id, if @id >= 0, we're requesting that specific id */ |
1622 | if (i >= 0) { | 1617 | if (i >= 0) { |
1623 | err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL); | 1618 | err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL); |
@@ -1635,6 +1630,12 @@ static int loop_add(struct loop_device **l, int i) | |||
1635 | if (!lo->lo_queue) | 1630 | if (!lo->lo_queue) |
1636 | goto out_free_idr; | 1631 | goto out_free_idr; |
1637 | 1632 | ||
1633 | /* | ||
1634 | * set queue make_request_fn | ||
1635 | */ | ||
1636 | blk_queue_make_request(lo->lo_queue, loop_make_request); | ||
1637 | lo->lo_queue->queuedata = lo; | ||
1638 | |||
1638 | disk = lo->lo_disk = alloc_disk(1 << part_shift); | 1639 | disk = lo->lo_disk = alloc_disk(1 << part_shift); |
1639 | if (!disk) | 1640 | if (!disk) |
1640 | goto out_free_queue; | 1641 | goto out_free_queue; |
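These loop.c hunks move blk_queue_make_request() and the queuedata back-pointer from loop_set_fd() to loop_add(), tying them to the allocation of lo_queue instead of redoing them on every bind; lo_state is also set to Lo_unbound before the device becomes visible. The resulting allocation-time setup for a bio-based driver reads roughly like this (simplified sketch, error handling trimmed):

        lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
        if (!lo->lo_queue)
                goto out_free_idr;
        /* route all bios for this queue to loop_make_request() */
        blk_queue_make_request(lo->lo_queue, loop_make_request);
        lo->lo_queue->queuedata = lo;   /* back-pointer used by loop_make_request() */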
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 77a60bedd7a3..7bc363f1ee82 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -936,7 +936,7 @@ static int mg_probe(struct platform_device *plat_dev) | |||
936 | goto probe_err_3b; | 936 | goto probe_err_3b; |
937 | } | 937 | } |
938 | err = request_irq(host->irq, mg_irq, | 938 | err = request_irq(host->irq, mg_irq, |
939 | IRQF_DISABLED | IRQF_TRIGGER_RISING, | 939 | IRQF_TRIGGER_RISING, |
940 | MG_DEV_NAME, host); | 940 | MG_DEV_NAME, host); |
941 | if (err) { | 941 | if (err) { |
942 | printk(KERN_ERR "%s:%d fail (request_irq err=%d)\n", | 942 | printk(KERN_ERR "%s:%d fail (request_irq err=%d)\n", |
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 952dbfe22126..050c71267f14 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -126,64 +126,30 @@ struct mtip_compat_ide_task_request_s { | |||
126 | static bool mtip_check_surprise_removal(struct pci_dev *pdev) | 126 | static bool mtip_check_surprise_removal(struct pci_dev *pdev) |
127 | { | 127 | { |
128 | u16 vendor_id = 0; | 128 | u16 vendor_id = 0; |
129 | struct driver_data *dd = pci_get_drvdata(pdev); | ||
130 | |||
131 | if (dd->sr) | ||
132 | return true; | ||
129 | 133 | ||
130 | /* Read the vendorID from the configuration space */ | 134 | /* Read the vendorID from the configuration space */ |
131 | pci_read_config_word(pdev, 0x00, &vendor_id); | 135 | pci_read_config_word(pdev, 0x00, &vendor_id); |
132 | if (vendor_id == 0xFFFF) | 136 | if (vendor_id == 0xFFFF) { |
137 | dd->sr = true; | ||
138 | if (dd->queue) | ||
139 | set_bit(QUEUE_FLAG_DEAD, &dd->queue->queue_flags); | ||
140 | else | ||
141 | dev_warn(&dd->pdev->dev, | ||
142 | "%s: dd->queue is NULL\n", __func__); | ||
143 | if (dd->port) { | ||
144 | set_bit(MTIP_PF_SR_CLEANUP_BIT, &dd->port->flags); | ||
145 | wake_up_interruptible(&dd->port->svc_wait); | ||
146 | } else | ||
147 | dev_warn(&dd->pdev->dev, | ||
148 | "%s: dd->port is NULL\n", __func__); | ||
133 | return true; /* device removed */ | 149 | return true; /* device removed */ |
134 | |||
135 | return false; /* device present */ | ||
136 | } | ||
137 | |||
138 | /* | ||
139 | * This function is called for clean the pending command in the | ||
140 | * command slot during the surprise removal of device and return | ||
141 | * error to the upper layer. | ||
142 | * | ||
143 | * @dd Pointer to the DRIVER_DATA structure. | ||
144 | * | ||
145 | * return value | ||
146 | * None | ||
147 | */ | ||
148 | static void mtip_command_cleanup(struct driver_data *dd) | ||
149 | { | ||
150 | int group = 0, commandslot = 0, commandindex = 0; | ||
151 | struct mtip_cmd *command; | ||
152 | struct mtip_port *port = dd->port; | ||
153 | static int in_progress; | ||
154 | |||
155 | if (in_progress) | ||
156 | return; | ||
157 | |||
158 | in_progress = 1; | ||
159 | |||
160 | for (group = 0; group < 4; group++) { | ||
161 | for (commandslot = 0; commandslot < 32; commandslot++) { | ||
162 | if (!(port->allocated[group] & (1 << commandslot))) | ||
163 | continue; | ||
164 | |||
165 | commandindex = group << 5 | commandslot; | ||
166 | command = &port->commands[commandindex]; | ||
167 | |||
168 | if (atomic_read(&command->active) | ||
169 | && (command->async_callback)) { | ||
170 | command->async_callback(command->async_data, | ||
171 | -ENODEV); | ||
172 | command->async_callback = NULL; | ||
173 | command->async_data = NULL; | ||
174 | } | ||
175 | |||
176 | dma_unmap_sg(&port->dd->pdev->dev, | ||
177 | command->sg, | ||
178 | command->scatter_ents, | ||
179 | command->direction); | ||
180 | } | ||
181 | } | 150 | } |
182 | 151 | ||
183 | up(&port->cmd_slot); | 152 | return false; /* device present */ |
184 | |||
185 | set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag); | ||
186 | in_progress = 0; | ||
187 | } | 153 | } |
188 | 154 | ||
189 | /* | 155 | /* |
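The reworked mtip_check_surprise_removal() above now caches the result in dd->sr, marks the queue dead and kicks the service thread for cleanup. The detection itself relies on PCIe behaviour: configuration reads to a device that is no longer present complete as all ones, so a vendor ID of 0xFFFF means the device is gone. That idea in isolation (hypothetical helper, not the driver function above):

        #include <linux/pci.h>

        static bool pcie_device_vanished(struct pci_dev *pdev)
        {
                u16 vendor = 0;

                /* reads from a removed device return all 1s */
                pci_read_config_word(pdev, PCI_VENDOR_ID, &vendor);
                return vendor == 0xFFFF;
        }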
@@ -222,10 +188,7 @@ static int get_slot(struct mtip_port *port) | |||
222 | } | 188 | } |
223 | dev_warn(&port->dd->pdev->dev, "Failed to get a tag.\n"); | 189 | dev_warn(&port->dd->pdev->dev, "Failed to get a tag.\n"); |
224 | 190 | ||
225 | if (mtip_check_surprise_removal(port->dd->pdev)) { | 191 | mtip_check_surprise_removal(port->dd->pdev); |
226 | /* Device not present, clean outstanding commands */ | ||
227 | mtip_command_cleanup(port->dd); | ||
228 | } | ||
229 | return -1; | 192 | return -1; |
230 | } | 193 | } |
231 | 194 | ||
@@ -246,6 +209,107 @@ static inline void release_slot(struct mtip_port *port, int tag) | |||
246 | } | 209 | } |
247 | 210 | ||
248 | /* | 211 | /* |
212 | * IO completion function. | ||
213 | * | ||
214 | * This completion function is called by the driver ISR when a | ||
215 | * command that was issued by the kernel completes. It first calls the | ||
216 | * asynchronous completion function which normally calls back into the block | ||
217 | * layer passing the asynchronous callback data, then unmaps the | ||
218 | * scatter list associated with the completed command, and finally | ||
219 | * clears the allocated bit associated with the completed command. | ||
220 | * | ||
221 | * @port Pointer to the port data structure. | ||
222 | * @tag Tag of the command. | ||
223 | * @data Pointer to driver_data. | ||
224 | * @status Completion status. | ||
225 | * | ||
226 | * return value | ||
227 | * None | ||
228 | */ | ||
229 | static void mtip_async_complete(struct mtip_port *port, | ||
230 | int tag, | ||
231 | void *data, | ||
232 | int status) | ||
233 | { | ||
234 | struct mtip_cmd *command; | ||
235 | struct driver_data *dd = data; | ||
236 | int cb_status = status ? -EIO : 0; | ||
237 | |||
238 | if (unlikely(!dd) || unlikely(!port)) | ||
239 | return; | ||
240 | |||
241 | command = &port->commands[tag]; | ||
242 | |||
243 | if (unlikely(status == PORT_IRQ_TF_ERR)) { | ||
244 | dev_warn(&port->dd->pdev->dev, | ||
245 | "Command tag %d failed due to TFE\n", tag); | ||
246 | } | ||
247 | |||
248 | /* Upper layer callback */ | ||
249 | if (likely(command->async_callback)) | ||
250 | command->async_callback(command->async_data, cb_status); | ||
251 | |||
252 | command->async_callback = NULL; | ||
253 | command->comp_func = NULL; | ||
254 | |||
255 | /* Unmap the DMA scatter list entries */ | ||
256 | dma_unmap_sg(&dd->pdev->dev, | ||
257 | command->sg, | ||
258 | command->scatter_ents, | ||
259 | command->direction); | ||
260 | |||
261 | /* Clear the allocated and active bits for the command */ | ||
262 | atomic_set(&port->commands[tag].active, 0); | ||
263 | release_slot(port, tag); | ||
264 | |||
265 | up(&port->cmd_slot); | ||
266 | } | ||
267 | |||
268 | /* | ||
269 | * This function is called for clean the pending command in the | ||
270 | * command slot during the surprise removal of device and return | ||
271 | * error to the upper layer. | ||
272 | * | ||
273 | * @dd Pointer to the DRIVER_DATA structure. | ||
274 | * | ||
275 | * return value | ||
276 | * None | ||
277 | */ | ||
278 | static void mtip_command_cleanup(struct driver_data *dd) | ||
279 | { | ||
280 | int tag = 0; | ||
281 | struct mtip_cmd *cmd; | ||
282 | struct mtip_port *port = dd->port; | ||
283 | unsigned int num_cmd_slots = dd->slot_groups * 32; | ||
284 | |||
285 | if (!test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) | ||
286 | return; | ||
287 | |||
288 | if (!port) | ||
289 | return; | ||
290 | |||
291 | cmd = &port->commands[MTIP_TAG_INTERNAL]; | ||
292 | if (atomic_read(&cmd->active)) | ||
293 | if (readl(port->cmd_issue[MTIP_TAG_INTERNAL]) & | ||
294 | (1 << MTIP_TAG_INTERNAL)) | ||
295 | if (cmd->comp_func) | ||
296 | cmd->comp_func(port, MTIP_TAG_INTERNAL, | ||
297 | cmd->comp_data, -ENODEV); | ||
298 | |||
299 | while (1) { | ||
300 | tag = find_next_bit(port->allocated, num_cmd_slots, tag); | ||
301 | if (tag >= num_cmd_slots) | ||
302 | break; | ||
303 | |||
304 | cmd = &port->commands[tag]; | ||
305 | if (atomic_read(&cmd->active)) | ||
306 | mtip_async_complete(port, tag, dd, -ENODEV); | ||
307 | } | ||
308 | |||
309 | set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag); | ||
310 | } | ||
311 | |||
312 | /* | ||
249 | * Reset the HBA (without sleeping) | 313 | * Reset the HBA (without sleeping) |
250 | * | 314 | * |
251 | * @dd Pointer to the driver data structure. | 315 | * @dd Pointer to the driver data structure. |
@@ -584,6 +648,9 @@ static void mtip_timeout_function(unsigned long int data) | |||
584 | if (unlikely(!port)) | 648 | if (unlikely(!port)) |
585 | return; | 649 | return; |
586 | 650 | ||
651 | if (unlikely(port->dd->sr)) | ||
652 | return; | ||
653 | |||
587 | if (test_bit(MTIP_DDF_RESUME_BIT, &port->dd->dd_flag)) { | 654 | if (test_bit(MTIP_DDF_RESUME_BIT, &port->dd->dd_flag)) { |
588 | mod_timer(&port->cmd_timer, | 655 | mod_timer(&port->cmd_timer, |
589 | jiffies + msecs_to_jiffies(30000)); | 656 | jiffies + msecs_to_jiffies(30000)); |
@@ -675,66 +742,6 @@ static void mtip_timeout_function(unsigned long int data) | |||
675 | } | 742 | } |
676 | 743 | ||
677 | /* | 744 | /* |
678 | * IO completion function. | ||
679 | * | ||
680 | * This completion function is called by the driver ISR when a | ||
681 | * command that was issued by the kernel completes. It first calls the | ||
682 | * asynchronous completion function which normally calls back into the block | ||
683 | * layer passing the asynchronous callback data, then unmaps the | ||
684 | * scatter list associated with the completed command, and finally | ||
685 | * clears the allocated bit associated with the completed command. | ||
686 | * | ||
687 | * @port Pointer to the port data structure. | ||
688 | * @tag Tag of the command. | ||
689 | * @data Pointer to driver_data. | ||
690 | * @status Completion status. | ||
691 | * | ||
692 | * return value | ||
693 | * None | ||
694 | */ | ||
695 | static void mtip_async_complete(struct mtip_port *port, | ||
696 | int tag, | ||
697 | void *data, | ||
698 | int status) | ||
699 | { | ||
700 | struct mtip_cmd *command; | ||
701 | struct driver_data *dd = data; | ||
702 | int cb_status = status ? -EIO : 0; | ||
703 | |||
704 | if (unlikely(!dd) || unlikely(!port)) | ||
705 | return; | ||
706 | |||
707 | command = &port->commands[tag]; | ||
708 | |||
709 | if (unlikely(status == PORT_IRQ_TF_ERR)) { | ||
710 | dev_warn(&port->dd->pdev->dev, | ||
711 | "Command tag %d failed due to TFE\n", tag); | ||
712 | } | ||
713 | |||
714 | /* Upper layer callback */ | ||
715 | if (likely(command->async_callback)) | ||
716 | command->async_callback(command->async_data, cb_status); | ||
717 | |||
718 | command->async_callback = NULL; | ||
719 | command->comp_func = NULL; | ||
720 | |||
721 | /* Unmap the DMA scatter list entries */ | ||
722 | dma_unmap_sg(&dd->pdev->dev, | ||
723 | command->sg, | ||
724 | command->scatter_ents, | ||
725 | command->direction); | ||
726 | |||
727 | /* Clear the allocated and active bits for the command */ | ||
728 | atomic_set(&port->commands[tag].active, 0); | ||
729 | release_slot(port, tag); | ||
730 | |||
731 | if (unlikely(command->unaligned)) | ||
732 | up(&port->cmd_slot_unal); | ||
733 | else | ||
734 | up(&port->cmd_slot); | ||
735 | } | ||
736 | |||
737 | /* | ||
738 | * Internal command completion callback function. | 745 | * Internal command completion callback function. |
739 | * | 746 | * |
740 | * This function is normally called by the driver ISR when an internal | 747 | * This function is normally called by the driver ISR when an internal |
@@ -854,7 +861,6 @@ static void mtip_handle_tfe(struct driver_data *dd) | |||
854 | "Missing completion func for tag %d", | 861 | "Missing completion func for tag %d", |
855 | tag); | 862 | tag); |
856 | if (mtip_check_surprise_removal(dd->pdev)) { | 863 | if (mtip_check_surprise_removal(dd->pdev)) { |
857 | mtip_command_cleanup(dd); | ||
858 | /* don't proceed further */ | 864 | /* don't proceed further */ |
859 | return; | 865 | return; |
860 | } | 866 | } |
@@ -1018,14 +1024,12 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group, | |||
1018 | command->comp_data, | 1024 | command->comp_data, |
1019 | 0); | 1025 | 0); |
1020 | } else { | 1026 | } else { |
1021 | dev_warn(&dd->pdev->dev, | 1027 | dev_dbg(&dd->pdev->dev, |
1022 | "Null completion " | 1028 | "Null completion for tag %d", |
1023 | "for tag %d", | ||
1024 | tag); | 1029 | tag); |
1025 | 1030 | ||
1026 | if (mtip_check_surprise_removal( | 1031 | if (mtip_check_surprise_removal( |
1027 | dd->pdev)) { | 1032 | dd->pdev)) { |
1028 | mtip_command_cleanup(dd); | ||
1029 | return; | 1033 | return; |
1030 | } | 1034 | } |
1031 | } | 1035 | } |
@@ -1145,7 +1149,6 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data) | |||
1145 | 1149 | ||
1146 | if (unlikely(port_stat & PORT_IRQ_ERR)) { | 1150 | if (unlikely(port_stat & PORT_IRQ_ERR)) { |
1147 | if (unlikely(mtip_check_surprise_removal(dd->pdev))) { | 1151 | if (unlikely(mtip_check_surprise_removal(dd->pdev))) { |
1148 | mtip_command_cleanup(dd); | ||
1149 | /* don't proceed further */ | 1152 | /* don't proceed further */ |
1150 | return IRQ_HANDLED; | 1153 | return IRQ_HANDLED; |
1151 | } | 1154 | } |
@@ -2806,34 +2809,51 @@ static ssize_t show_device_status(struct device_driver *drv, char *buf) | |||
2806 | static ssize_t mtip_hw_read_device_status(struct file *f, char __user *ubuf, | 2809 | static ssize_t mtip_hw_read_device_status(struct file *f, char __user *ubuf, |
2807 | size_t len, loff_t *offset) | 2810 | size_t len, loff_t *offset) |
2808 | { | 2811 | { |
2812 | struct driver_data *dd = (struct driver_data *)f->private_data; | ||
2809 | int size = *offset; | 2813 | int size = *offset; |
2810 | char buf[MTIP_DFS_MAX_BUF_SIZE]; | 2814 | char *buf; |
2815 | int rv = 0; | ||
2811 | 2816 | ||
2812 | if (!len || *offset) | 2817 | if (!len || *offset) |
2813 | return 0; | 2818 | return 0; |
2814 | 2819 | ||
2820 | buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL); | ||
2821 | if (!buf) { | ||
2822 | dev_err(&dd->pdev->dev, | ||
2823 | "Memory allocation: status buffer\n"); | ||
2824 | return -ENOMEM; | ||
2825 | } | ||
2826 | |||
2815 | size += show_device_status(NULL, buf); | 2827 | size += show_device_status(NULL, buf); |
2816 | 2828 | ||
2817 | *offset = size <= len ? size : len; | 2829 | *offset = size <= len ? size : len; |
2818 | size = copy_to_user(ubuf, buf, *offset); | 2830 | size = copy_to_user(ubuf, buf, *offset); |
2819 | if (size) | 2831 | if (size) |
2820 | return -EFAULT; | 2832 | rv = -EFAULT; |
2821 | 2833 | ||
2822 | return *offset; | 2834 | kfree(buf); |
2835 | return rv ? rv : *offset; | ||
2823 | } | 2836 | } |
2824 | 2837 | ||
2825 | static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf, | 2838 | static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf, |
2826 | size_t len, loff_t *offset) | 2839 | size_t len, loff_t *offset) |
2827 | { | 2840 | { |
2828 | struct driver_data *dd = (struct driver_data *)f->private_data; | 2841 | struct driver_data *dd = (struct driver_data *)f->private_data; |
2829 | char buf[MTIP_DFS_MAX_BUF_SIZE]; | 2842 | char *buf; |
2830 | u32 group_allocated; | 2843 | u32 group_allocated; |
2831 | int size = *offset; | 2844 | int size = *offset; |
2832 | int n; | 2845 | int n, rv = 0; |
2833 | 2846 | ||
2834 | if (!len || size) | 2847 | if (!len || size) |
2835 | return 0; | 2848 | return 0; |
2836 | 2849 | ||
2850 | buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL); | ||
2851 | if (!buf) { | ||
2852 | dev_err(&dd->pdev->dev, | ||
2853 | "Memory allocation: register buffer\n"); | ||
2854 | return -ENOMEM; | ||
2855 | } | ||
2856 | |||
2837 | size += sprintf(&buf[size], "H/ S ACTive : [ 0x"); | 2857 | size += sprintf(&buf[size], "H/ S ACTive : [ 0x"); |
2838 | 2858 | ||
2839 | for (n = dd->slot_groups-1; n >= 0; n--) | 2859 | for (n = dd->slot_groups-1; n >= 0; n--) |
@@ -2888,21 +2908,30 @@ static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf, | |||
2888 | *offset = size <= len ? size : len; | 2908 | *offset = size <= len ? size : len; |
2889 | size = copy_to_user(ubuf, buf, *offset); | 2909 | size = copy_to_user(ubuf, buf, *offset); |
2890 | if (size) | 2910 | if (size) |
2891 | return -EFAULT; | 2911 | rv = -EFAULT; |
2892 | 2912 | ||
2893 | return *offset; | 2913 | kfree(buf); |
2914 | return rv ? rv : *offset; | ||
2894 | } | 2915 | } |
2895 | 2916 | ||
2896 | static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf, | 2917 | static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf, |
2897 | size_t len, loff_t *offset) | 2918 | size_t len, loff_t *offset) |
2898 | { | 2919 | { |
2899 | struct driver_data *dd = (struct driver_data *)f->private_data; | 2920 | struct driver_data *dd = (struct driver_data *)f->private_data; |
2900 | char buf[MTIP_DFS_MAX_BUF_SIZE]; | 2921 | char *buf; |
2901 | int size = *offset; | 2922 | int size = *offset; |
2923 | int rv = 0; | ||
2902 | 2924 | ||
2903 | if (!len || size) | 2925 | if (!len || size) |
2904 | return 0; | 2926 | return 0; |
2905 | 2927 | ||
2928 | buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL); | ||
2929 | if (!buf) { | ||
2930 | dev_err(&dd->pdev->dev, | ||
2931 | "Memory allocation: flag buffer\n"); | ||
2932 | return -ENOMEM; | ||
2933 | } | ||
2934 | |||
2906 | size += sprintf(&buf[size], "Flag-port : [ %08lX ]\n", | 2935 | size += sprintf(&buf[size], "Flag-port : [ %08lX ]\n", |
2907 | dd->port->flags); | 2936 | dd->port->flags); |
2908 | size += sprintf(&buf[size], "Flag-dd : [ %08lX ]\n", | 2937 | size += sprintf(&buf[size], "Flag-dd : [ %08lX ]\n", |
@@ -2911,9 +2940,10 @@ static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf, | |||
2911 | *offset = size <= len ? size : len; | 2940 | *offset = size <= len ? size : len; |
2912 | size = copy_to_user(ubuf, buf, *offset); | 2941 | size = copy_to_user(ubuf, buf, *offset); |
2913 | if (size) | 2942 | if (size) |
2914 | return -EFAULT; | 2943 | rv = -EFAULT; |
2915 | 2944 | ||
2916 | return *offset; | 2945 | kfree(buf); |
2946 | return rv ? rv : *offset; | ||
2917 | } | 2947 | } |
2918 | 2948 | ||
2919 | static const struct file_operations mtip_device_status_fops = { | 2949 | static const struct file_operations mtip_device_status_fops = { |
@@ -3006,6 +3036,46 @@ static void mtip_hw_debugfs_exit(struct driver_data *dd) | |||
3006 | debugfs_remove_recursive(dd->dfs_node); | 3036 | debugfs_remove_recursive(dd->dfs_node); |
3007 | } | 3037 | } |
3008 | 3038 | ||
3039 | static int mtip_free_orphan(struct driver_data *dd) | ||
3040 | { | ||
3041 | struct kobject *kobj; | ||
3042 | |||
3043 | if (dd->bdev) { | ||
3044 | if (dd->bdev->bd_holders >= 1) | ||
3045 | return -2; | ||
3046 | |||
3047 | bdput(dd->bdev); | ||
3048 | dd->bdev = NULL; | ||
3049 | } | ||
3050 | |||
3051 | mtip_hw_debugfs_exit(dd); | ||
3052 | |||
3053 | spin_lock(&rssd_index_lock); | ||
3054 | ida_remove(&rssd_index_ida, dd->index); | ||
3055 | spin_unlock(&rssd_index_lock); | ||
3056 | |||
3057 | if (!test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag) && | ||
3058 | test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) { | ||
3059 | put_disk(dd->disk); | ||
3060 | } else { | ||
3061 | if (dd->disk) { | ||
3062 | kobj = kobject_get(&disk_to_dev(dd->disk)->kobj); | ||
3063 | if (kobj) { | ||
3064 | mtip_hw_sysfs_exit(dd, kobj); | ||
3065 | kobject_put(kobj); | ||
3066 | } | ||
3067 | del_gendisk(dd->disk); | ||
3068 | dd->disk = NULL; | ||
3069 | } | ||
3070 | if (dd->queue) { | ||
3071 | dd->queue->queuedata = NULL; | ||
3072 | blk_cleanup_queue(dd->queue); | ||
3073 | dd->queue = NULL; | ||
3074 | } | ||
3075 | } | ||
3076 | kfree(dd); | ||
3077 | return 0; | ||
3078 | } | ||
3009 | 3079 | ||
3010 | /* | 3080 | /* |
3011 | * Perform any init/resume time hardware setup | 3081 | * Perform any init/resume time hardware setup |
@@ -3154,6 +3224,7 @@ static int mtip_service_thread(void *data) | |||
3154 | unsigned long slot, slot_start, slot_wrap; | 3224 | unsigned long slot, slot_start, slot_wrap; |
3155 | unsigned int num_cmd_slots = dd->slot_groups * 32; | 3225 | unsigned int num_cmd_slots = dd->slot_groups * 32; |
3156 | struct mtip_port *port = dd->port; | 3226 | struct mtip_port *port = dd->port; |
3227 | int ret; | ||
3157 | 3228 | ||
3158 | while (1) { | 3229 | while (1) { |
3159 | /* | 3230 | /* |
@@ -3164,13 +3235,18 @@ static int mtip_service_thread(void *data) | |||
3164 | !(port->flags & MTIP_PF_PAUSE_IO)); | 3235 | !(port->flags & MTIP_PF_PAUSE_IO)); |
3165 | 3236 | ||
3166 | if (kthread_should_stop()) | 3237 | if (kthread_should_stop()) |
3238 | goto st_out; | ||
3239 | |||
3240 | set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); | ||
3241 | |||
3242 | /* If I am an orphan, start self cleanup */ | ||
3243 | if (test_bit(MTIP_PF_SR_CLEANUP_BIT, &port->flags)) | ||
3167 | break; | 3244 | break; |
3168 | 3245 | ||
3169 | if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, | 3246 | if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, |
3170 | &dd->dd_flag))) | 3247 | &dd->dd_flag))) |
3171 | break; | 3248 | goto st_out; |
3172 | 3249 | ||
3173 | set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); | ||
3174 | if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) { | 3250 | if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) { |
3175 | slot = 1; | 3251 | slot = 1; |
3176 | /* used to restrict the loop to one iteration */ | 3252 | /* used to restrict the loop to one iteration */ |
@@ -3201,7 +3277,7 @@ static int mtip_service_thread(void *data) | |||
3201 | 3277 | ||
3202 | clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags); | 3278 | clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags); |
3203 | } else if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) { | 3279 | } else if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) { |
3204 | if (!mtip_ftl_rebuild_poll(dd)) | 3280 | if (mtip_ftl_rebuild_poll(dd) < 0) |
3205 | set_bit(MTIP_DDF_REBUILD_FAILED_BIT, | 3281 | set_bit(MTIP_DDF_REBUILD_FAILED_BIT, |
3206 | &dd->dd_flag); | 3282 | &dd->dd_flag); |
3207 | clear_bit(MTIP_PF_REBUILD_BIT, &port->flags); | 3283 | clear_bit(MTIP_PF_REBUILD_BIT, &port->flags); |
@@ -3209,8 +3285,30 @@ static int mtip_service_thread(void *data) | |||
3209 | clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); | 3285 | clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); |
3210 | 3286 | ||
3211 | if (test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags)) | 3287 | if (test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags)) |
3288 | goto st_out; | ||
3289 | } | ||
3290 | |||
3291 | /* wait for pci remove to exit */ | ||
3292 | while (1) { | ||
3293 | if (test_bit(MTIP_DDF_REMOVE_DONE_BIT, &dd->dd_flag)) | ||
3212 | break; | 3294 | break; |
3295 | msleep_interruptible(1000); | ||
3296 | if (kthread_should_stop()) | ||
3297 | goto st_out; | ||
3298 | } | ||
3299 | |||
3300 | while (1) { | ||
3301 | ret = mtip_free_orphan(dd); | ||
3302 | if (!ret) { | ||
3303 | /* NOTE: All data structures are invalid, do not | ||
3304 | * access any here */ | ||
3305 | return 0; | ||
3306 | } | ||
3307 | msleep_interruptible(1000); | ||
3308 | if (kthread_should_stop()) | ||
3309 | goto st_out; | ||
3213 | } | 3310 | } |
3311 | st_out: | ||
3214 | return 0; | 3312 | return 0; |
3215 | } | 3313 | } |
3216 | 3314 | ||
@@ -3437,13 +3535,13 @@ static int mtip_hw_init(struct driver_data *dd) | |||
3437 | rv = -EFAULT; | 3535 | rv = -EFAULT; |
3438 | goto out3; | 3536 | goto out3; |
3439 | } | 3537 | } |
3538 | mtip_dump_identify(dd->port); | ||
3440 | 3539 | ||
3441 | if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) == | 3540 | if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) == |
3442 | MTIP_FTL_REBUILD_MAGIC) { | 3541 | MTIP_FTL_REBUILD_MAGIC) { |
3443 | set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags); | 3542 | set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags); |
3444 | return MTIP_FTL_REBUILD_MAGIC; | 3543 | return MTIP_FTL_REBUILD_MAGIC; |
3445 | } | 3544 | } |
3446 | mtip_dump_identify(dd->port); | ||
3447 | 3545 | ||
3448 | /* check write protect, over temp and rebuild statuses */ | 3546 | /* check write protect, over temp and rebuild statuses */ |
3449 | rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ, | 3547 | rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ, |
@@ -3467,8 +3565,8 @@ static int mtip_hw_init(struct driver_data *dd) | |||
3467 | } | 3565 | } |
3468 | if (buf[288] == 0xBF) { | 3566 | if (buf[288] == 0xBF) { |
3469 | dev_info(&dd->pdev->dev, | 3567 | dev_info(&dd->pdev->dev, |
3470 | "Drive indicates rebuild has failed.\n"); | 3568 | "Drive is in security locked state.\n"); |
3471 | /* TODO */ | 3569 | set_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag); |
3472 | } | 3570 | } |
3473 | } | 3571 | } |
3474 | 3572 | ||
@@ -3523,9 +3621,8 @@ static int mtip_hw_exit(struct driver_data *dd) | |||
3523 | * Send standby immediate (E0h) to the drive so that it | 3621 | * Send standby immediate (E0h) to the drive so that it |
3524 | * saves its state. | 3622 | * saves its state. |
3525 | */ | 3623 | */ |
3526 | if (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) { | 3624 | if (!dd->sr) { |
3527 | 3625 | if (!test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) | |
3528 | if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags)) | ||
3529 | if (mtip_standby_immediate(dd->port)) | 3626 | if (mtip_standby_immediate(dd->port)) |
3530 | dev_warn(&dd->pdev->dev, | 3627 | dev_warn(&dd->pdev->dev, |
3531 | "STANDBY IMMEDIATE failed\n"); | 3628 | "STANDBY IMMEDIATE failed\n"); |
@@ -3551,6 +3648,7 @@ static int mtip_hw_exit(struct driver_data *dd) | |||
3551 | dd->port->command_list_dma); | 3648 | dd->port->command_list_dma); |
3552 | /* Free the memory allocated for the for structure. */ | 3649 | /* Free the memory allocated for the for structure. */ |
3553 | kfree(dd->port); | 3650 | kfree(dd->port); |
3651 | dd->port = NULL; | ||
3554 | 3652 | ||
3555 | return 0; | 3653 | return 0; |
3556 | } | 3654 | } |
@@ -3572,7 +3670,8 @@ static int mtip_hw_shutdown(struct driver_data *dd) | |||
3572 | * Send standby immediate (E0h) to the drive so that it | 3670 | * Send standby immediate (E0h) to the drive so that it |
3573 | * saves its state. | 3671 | * saves its state. |
3574 | */ | 3672 | */ |
3575 | mtip_standby_immediate(dd->port); | 3673 | if (!dd->sr && dd->port) |
3674 | mtip_standby_immediate(dd->port); | ||
3576 | 3675 | ||
3577 | return 0; | 3676 | return 0; |
3578 | } | 3677 | } |
@@ -3887,6 +3986,10 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) | |||
3887 | bio_endio(bio, -ENODATA); | 3986 | bio_endio(bio, -ENODATA); |
3888 | return; | 3987 | return; |
3889 | } | 3988 | } |
3989 | if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) { | ||
3990 | bio_endio(bio, -ENXIO); | ||
3991 | return; | ||
3992 | } | ||
3890 | } | 3993 | } |
3891 | 3994 | ||
3892 | if (unlikely(bio->bi_rw & REQ_DISCARD)) { | 3995 | if (unlikely(bio->bi_rw & REQ_DISCARD)) { |
@@ -4010,6 +4113,8 @@ static int mtip_block_initialize(struct driver_data *dd) | |||
4010 | dd->disk->private_data = dd; | 4113 | dd->disk->private_data = dd; |
4011 | dd->index = index; | 4114 | dd->index = index; |
4012 | 4115 | ||
4116 | mtip_hw_debugfs_init(dd); | ||
4117 | |||
4013 | /* | 4118 | /* |
4014 | * if rebuild pending, start the service thread, and delay the block | 4119 | * if rebuild pending, start the service thread, and delay the block |
4015 | * queue creation and add_disk() | 4120 | * queue creation and add_disk() |
@@ -4068,6 +4173,7 @@ skip_create_disk: | |||
4068 | /* Enable the block device and add it to /dev */ | 4173 | /* Enable the block device and add it to /dev */ |
4069 | add_disk(dd->disk); | 4174 | add_disk(dd->disk); |
4070 | 4175 | ||
4176 | dd->bdev = bdget_disk(dd->disk, 0); | ||
4071 | /* | 4177 | /* |
4072 | * Now that the disk is active, initialize any sysfs attributes | 4178 | * Now that the disk is active, initialize any sysfs attributes |
4073 | * managed by the protocol layer. | 4179 | * managed by the protocol layer. |
@@ -4077,7 +4183,6 @@ skip_create_disk: | |||
4077 | mtip_hw_sysfs_init(dd, kobj); | 4183 | mtip_hw_sysfs_init(dd, kobj); |
4078 | kobject_put(kobj); | 4184 | kobject_put(kobj); |
4079 | } | 4185 | } |
4080 | mtip_hw_debugfs_init(dd); | ||
4081 | 4186 | ||
4082 | if (dd->mtip_svc_handler) { | 4187 | if (dd->mtip_svc_handler) { |
4083 | set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag); | 4188 | set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag); |
@@ -4103,7 +4208,8 @@ start_service_thread: | |||
4103 | return rv; | 4208 | return rv; |
4104 | 4209 | ||
4105 | kthread_run_error: | 4210 | kthread_run_error: |
4106 | mtip_hw_debugfs_exit(dd); | 4211 | bdput(dd->bdev); |
4212 | dd->bdev = NULL; | ||
4107 | 4213 | ||
4108 | /* Delete our gendisk. This also removes the device from /dev */ | 4214 | /* Delete our gendisk. This also removes the device from /dev */ |
4109 | del_gendisk(dd->disk); | 4215 | del_gendisk(dd->disk); |
@@ -4112,6 +4218,7 @@ read_capacity_error: | |||
4112 | blk_cleanup_queue(dd->queue); | 4218 | blk_cleanup_queue(dd->queue); |
4113 | 4219 | ||
4114 | block_queue_alloc_init_error: | 4220 | block_queue_alloc_init_error: |
4221 | mtip_hw_debugfs_exit(dd); | ||
4115 | disk_index_error: | 4222 | disk_index_error: |
4116 | spin_lock(&rssd_index_lock); | 4223 | spin_lock(&rssd_index_lock); |
4117 | ida_remove(&rssd_index_ida, index); | 4224 | ida_remove(&rssd_index_ida, index); |
@@ -4141,40 +4248,48 @@ static int mtip_block_remove(struct driver_data *dd) | |||
4141 | { | 4248 | { |
4142 | struct kobject *kobj; | 4249 | struct kobject *kobj; |
4143 | 4250 | ||
4144 | if (dd->mtip_svc_handler) { | 4251 | if (!dd->sr) { |
4145 | set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags); | 4252 | mtip_hw_debugfs_exit(dd); |
4146 | wake_up_interruptible(&dd->port->svc_wait); | ||
4147 | kthread_stop(dd->mtip_svc_handler); | ||
4148 | } | ||
4149 | 4253 | ||
4150 | /* Clean up the sysfs attributes, if created */ | 4254 | if (dd->mtip_svc_handler) { |
4151 | if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) { | 4255 | set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags); |
4152 | kobj = kobject_get(&disk_to_dev(dd->disk)->kobj); | 4256 | wake_up_interruptible(&dd->port->svc_wait); |
4153 | if (kobj) { | 4257 | kthread_stop(dd->mtip_svc_handler); |
4154 | mtip_hw_sysfs_exit(dd, kobj); | ||
4155 | kobject_put(kobj); | ||
4156 | } | 4258 | } |
4157 | } | ||
4158 | mtip_hw_debugfs_exit(dd); | ||
4159 | 4259 | ||
4160 | /* | 4260 | /* Clean up the sysfs attributes, if created */ |
4161 | * Delete our gendisk structure. This also removes the device | 4261 | if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) { |
4162 | * from /dev | 4262 | kobj = kobject_get(&disk_to_dev(dd->disk)->kobj); |
4163 | */ | 4263 | if (kobj) { |
4164 | if (dd->disk) { | 4264 | mtip_hw_sysfs_exit(dd, kobj); |
4165 | if (dd->disk->queue) | 4265 | kobject_put(kobj); |
4166 | del_gendisk(dd->disk); | 4266 | } |
4167 | else | 4267 | } |
4168 | put_disk(dd->disk); | 4268 | /* |
4169 | } | 4269 | * Delete our gendisk structure. This also removes the device |
4170 | 4270 | * from /dev | |
4171 | spin_lock(&rssd_index_lock); | 4271 | */ |
4172 | ida_remove(&rssd_index_ida, dd->index); | 4272 | if (dd->bdev) { |
4173 | spin_unlock(&rssd_index_lock); | 4273 | bdput(dd->bdev); |
4274 | dd->bdev = NULL; | ||
4275 | } | ||
4276 | if (dd->disk) { | ||
4277 | if (dd->disk->queue) { | ||
4278 | del_gendisk(dd->disk); | ||
4279 | blk_cleanup_queue(dd->queue); | ||
4280 | dd->queue = NULL; | ||
4281 | } else | ||
4282 | put_disk(dd->disk); | ||
4283 | } | ||
4284 | dd->disk = NULL; | ||
4174 | 4285 | ||
4175 | blk_cleanup_queue(dd->queue); | 4286 | spin_lock(&rssd_index_lock); |
4176 | dd->disk = NULL; | 4287 | ida_remove(&rssd_index_ida, dd->index); |
4177 | dd->queue = NULL; | 4288 | spin_unlock(&rssd_index_lock); |
4289 | } else { | ||
4290 | dev_info(&dd->pdev->dev, "device %s surprise removal\n", | ||
4291 | dd->disk->disk_name); | ||
4292 | } | ||
4178 | 4293 | ||
4179 | /* De-initialize the protocol layer. */ | 4294 | /* De-initialize the protocol layer. */ |
4180 | mtip_hw_exit(dd); | 4295 | mtip_hw_exit(dd); |
@@ -4490,8 +4605,7 @@ done: | |||
4490 | static void mtip_pci_remove(struct pci_dev *pdev) | 4605 | static void mtip_pci_remove(struct pci_dev *pdev) |
4491 | { | 4606 | { |
4492 | struct driver_data *dd = pci_get_drvdata(pdev); | 4607 | struct driver_data *dd = pci_get_drvdata(pdev); |
4493 | int counter = 0; | 4608 | unsigned long flags, to; |
4494 | unsigned long flags; | ||
4495 | 4609 | ||
4496 | set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag); | 4610 | set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag); |
4497 | 4611 | ||
@@ -4500,17 +4614,22 @@ static void mtip_pci_remove(struct pci_dev *pdev) | |||
4500 | list_add(&dd->remove_list, &removing_list); | 4614 | list_add(&dd->remove_list, &removing_list); |
4501 | spin_unlock_irqrestore(&dev_lock, flags); | 4615 | spin_unlock_irqrestore(&dev_lock, flags); |
4502 | 4616 | ||
4503 | if (mtip_check_surprise_removal(pdev)) { | 4617 | mtip_check_surprise_removal(pdev); |
4504 | while (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) { | 4618 | synchronize_irq(dd->pdev->irq); |
4505 | counter++; | 4619 | |
4506 | msleep(20); | 4620 | /* Spin until workers are done */ |
4507 | if (counter == 10) { | 4621 | to = jiffies + msecs_to_jiffies(4000); |
4508 | /* Cleanup the outstanding commands */ | 4622 | do { |
4509 | mtip_command_cleanup(dd); | 4623 | msleep(20); |
4510 | break; | 4624 | } while (atomic_read(&dd->irq_workers_active) != 0 && |
4511 | } | 4625 | time_before(jiffies, to)); |
4512 | } | 4626 | |
4627 | if (atomic_read(&dd->irq_workers_active) != 0) { | ||
4628 | dev_warn(&dd->pdev->dev, | ||
4629 | "Completion workers still active!\n"); | ||
4513 | } | 4630 | } |
4631 | /* Cleanup the outstanding commands */ | ||
4632 | mtip_command_cleanup(dd); | ||
4514 | 4633 | ||
4515 | /* Clean up the block layer. */ | 4634 | /* Clean up the block layer. */ |
4516 | mtip_block_remove(dd); | 4635 | mtip_block_remove(dd); |
@@ -4529,8 +4648,15 @@ static void mtip_pci_remove(struct pci_dev *pdev) | |||
4529 | list_del_init(&dd->remove_list); | 4648 | list_del_init(&dd->remove_list); |
4530 | spin_unlock_irqrestore(&dev_lock, flags); | 4649 | spin_unlock_irqrestore(&dev_lock, flags); |
4531 | 4650 | ||
4532 | kfree(dd); | 4651 | if (!dd->sr) |
4652 | kfree(dd); | ||
4653 | else | ||
4654 | set_bit(MTIP_DDF_REMOVE_DONE_BIT, &dd->dd_flag); | ||
4655 | |||
4533 | pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); | 4656 | pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); |
4657 | pci_set_drvdata(pdev, NULL); | ||
4658 | pci_dev_put(pdev); | ||
4659 | |||
4534 | } | 4660 | } |
4535 | 4661 | ||
4536 | /* | 4662 | /* |
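Taken together, the mtip32xx.c hunks change who tears the device down after a surprise removal: mtip_pci_remove() no longer frees driver_data when dd->sr is set; it marks MTIP_DDF_REMOVE_DONE_BIT and leaves the final mtip_free_orphan() to the service thread. The old fixed retry loop (at most 10 x 20 ms) in the remove path is also replaced by a bounded poll on the active-worker count; the idiom, using the names from the hunk above, is roughly:

        #include <linux/jiffies.h>
        #include <linux/delay.h>

        unsigned long timeout = jiffies + msecs_to_jiffies(4000);

        /* let in-flight completion workers drain, but never wait forever */
        do {
                msleep(20);
        } while (atomic_read(&dd->irq_workers_active) != 0 &&
                 time_before(jiffies, timeout));

        if (atomic_read(&dd->irq_workers_active) != 0)
                dev_warn(&dd->pdev->dev, "Completion workers still active!\n");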
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index 3bb8a295fbe4..9be7a1582ad3 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -140,6 +140,7 @@ enum { | |||
140 | MTIP_PF_SVC_THD_ACTIVE_BIT = 4, | 140 | MTIP_PF_SVC_THD_ACTIVE_BIT = 4, |
141 | MTIP_PF_ISSUE_CMDS_BIT = 5, | 141 | MTIP_PF_ISSUE_CMDS_BIT = 5, |
142 | MTIP_PF_REBUILD_BIT = 6, | 142 | MTIP_PF_REBUILD_BIT = 6, |
143 | MTIP_PF_SR_CLEANUP_BIT = 7, | ||
143 | MTIP_PF_SVC_THD_STOP_BIT = 8, | 144 | MTIP_PF_SVC_THD_STOP_BIT = 8, |
144 | 145 | ||
145 | /* below are bit numbers in 'dd_flag' defined in driver_data */ | 146 | /* below are bit numbers in 'dd_flag' defined in driver_data */ |
@@ -147,15 +148,18 @@ enum { | |||
147 | MTIP_DDF_REMOVE_PENDING_BIT = 1, | 148 | MTIP_DDF_REMOVE_PENDING_BIT = 1, |
148 | MTIP_DDF_OVER_TEMP_BIT = 2, | 149 | MTIP_DDF_OVER_TEMP_BIT = 2, |
149 | MTIP_DDF_WRITE_PROTECT_BIT = 3, | 150 | MTIP_DDF_WRITE_PROTECT_BIT = 3, |
150 | MTIP_DDF_STOP_IO = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) | | 151 | MTIP_DDF_REMOVE_DONE_BIT = 4, |
151 | (1 << MTIP_DDF_SEC_LOCK_BIT) | | ||
152 | (1 << MTIP_DDF_OVER_TEMP_BIT) | | ||
153 | (1 << MTIP_DDF_WRITE_PROTECT_BIT)), | ||
154 | |||
155 | MTIP_DDF_CLEANUP_BIT = 5, | 152 | MTIP_DDF_CLEANUP_BIT = 5, |
156 | MTIP_DDF_RESUME_BIT = 6, | 153 | MTIP_DDF_RESUME_BIT = 6, |
157 | MTIP_DDF_INIT_DONE_BIT = 7, | 154 | MTIP_DDF_INIT_DONE_BIT = 7, |
158 | MTIP_DDF_REBUILD_FAILED_BIT = 8, | 155 | MTIP_DDF_REBUILD_FAILED_BIT = 8, |
156 | |||
157 | MTIP_DDF_STOP_IO = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) | | ||
158 | (1 << MTIP_DDF_SEC_LOCK_BIT) | | ||
159 | (1 << MTIP_DDF_OVER_TEMP_BIT) | | ||
160 | (1 << MTIP_DDF_WRITE_PROTECT_BIT) | | ||
161 | (1 << MTIP_DDF_REBUILD_FAILED_BIT)), | ||
162 | |||
159 | }; | 163 | }; |
160 | 164 | ||
161 | struct smart_attr { | 165 | struct smart_attr { |
@@ -499,6 +503,8 @@ struct driver_data { | |||
499 | 503 | ||
500 | bool trim_supp; /* flag indicating trim support */ | 504 | bool trim_supp; /* flag indicating trim support */ |
501 | 505 | ||
506 | bool sr; | ||
507 | |||
502 | int numa_node; /* NUMA support */ | 508 | int numa_node; /* NUMA support */ |
503 | 509 | ||
504 | char workq_name[32]; | 510 | char workq_name[32]; |
@@ -511,6 +517,8 @@ struct driver_data { | |||
511 | 517 | ||
512 | int isr_binding; | 518 | int isr_binding; |
513 | 519 | ||
520 | struct block_device *bdev; | ||
521 | |||
514 | int unal_qdepth; /* qdepth of unaligned IO queue */ | 522 | int unal_qdepth; /* qdepth of unaligned IO queue */ |
515 | 523 | ||
516 | struct list_head online_list; /* linkage for online list */ | 524 | struct list_head online_list; /* linkage for online list */ |
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 56188475cfd3..ff8668c5efb1 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -473,45 +473,31 @@ static void pkt_debugfs_dev_new(struct pktcdvd_device *pd) | |||
473 | { | 473 | { |
474 | if (!pkt_debugfs_root) | 474 | if (!pkt_debugfs_root) |
475 | return; | 475 | return; |
476 | pd->dfs_f_info = NULL; | ||
477 | pd->dfs_d_root = debugfs_create_dir(pd->name, pkt_debugfs_root); | 476 | pd->dfs_d_root = debugfs_create_dir(pd->name, pkt_debugfs_root); |
478 | if (IS_ERR(pd->dfs_d_root)) { | 477 | if (!pd->dfs_d_root) |
479 | pd->dfs_d_root = NULL; | ||
480 | return; | 478 | return; |
481 | } | 479 | |
482 | pd->dfs_f_info = debugfs_create_file("info", S_IRUGO, | 480 | pd->dfs_f_info = debugfs_create_file("info", S_IRUGO, |
483 | pd->dfs_d_root, pd, &debug_fops); | 481 | pd->dfs_d_root, pd, &debug_fops); |
484 | if (IS_ERR(pd->dfs_f_info)) { | ||
485 | pd->dfs_f_info = NULL; | ||
486 | return; | ||
487 | } | ||
488 | } | 482 | } |
489 | 483 | ||
490 | static void pkt_debugfs_dev_remove(struct pktcdvd_device *pd) | 484 | static void pkt_debugfs_dev_remove(struct pktcdvd_device *pd) |
491 | { | 485 | { |
492 | if (!pkt_debugfs_root) | 486 | if (!pkt_debugfs_root) |
493 | return; | 487 | return; |
494 | if (pd->dfs_f_info) | 488 | debugfs_remove(pd->dfs_f_info); |
495 | debugfs_remove(pd->dfs_f_info); | 489 | debugfs_remove(pd->dfs_d_root); |
496 | pd->dfs_f_info = NULL; | 490 | pd->dfs_f_info = NULL; |
497 | if (pd->dfs_d_root) | ||
498 | debugfs_remove(pd->dfs_d_root); | ||
499 | pd->dfs_d_root = NULL; | 491 | pd->dfs_d_root = NULL; |
500 | } | 492 | } |
501 | 493 | ||
502 | static void pkt_debugfs_init(void) | 494 | static void pkt_debugfs_init(void) |
503 | { | 495 | { |
504 | pkt_debugfs_root = debugfs_create_dir(DRIVER_NAME, NULL); | 496 | pkt_debugfs_root = debugfs_create_dir(DRIVER_NAME, NULL); |
505 | if (IS_ERR(pkt_debugfs_root)) { | ||
506 | pkt_debugfs_root = NULL; | ||
507 | return; | ||
508 | } | ||
509 | } | 497 | } |
510 | 498 | ||
511 | static void pkt_debugfs_cleanup(void) | 499 | static void pkt_debugfs_cleanup(void) |
512 | { | 500 | { |
513 | if (!pkt_debugfs_root) | ||
514 | return; | ||
515 | debugfs_remove(pkt_debugfs_root); | 501 | debugfs_remove(pkt_debugfs_root); |
516 | pkt_debugfs_root = NULL; | 502 | pkt_debugfs_root = NULL; |
517 | } | 503 | } |
diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c index 6e85e21445eb..a8de2eec6ff3 100644 --- a/drivers/block/rsxx/core.c +++ b/drivers/block/rsxx/core.c | |||
@@ -654,7 +654,8 @@ static void rsxx_eeh_failure(struct pci_dev *dev) | |||
654 | for (i = 0; i < card->n_targets; i++) { | 654 | for (i = 0; i < card->n_targets; i++) { |
655 | spin_lock_bh(&card->ctrl[i].queue_lock); | 655 | spin_lock_bh(&card->ctrl[i].queue_lock); |
656 | cnt = rsxx_cleanup_dma_queue(&card->ctrl[i], | 656 | cnt = rsxx_cleanup_dma_queue(&card->ctrl[i], |
657 | &card->ctrl[i].queue); | 657 | &card->ctrl[i].queue, |
658 | COMPLETE_DMA); | ||
658 | spin_unlock_bh(&card->ctrl[i].queue_lock); | 659 | spin_unlock_bh(&card->ctrl[i].queue_lock); |
659 | 660 | ||
660 | cnt += rsxx_dma_cancel(&card->ctrl[i]); | 661 | cnt += rsxx_dma_cancel(&card->ctrl[i]); |
@@ -748,10 +749,6 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev) | |||
748 | 749 | ||
749 | card->eeh_state = 0; | 750 | card->eeh_state = 0; |
750 | 751 | ||
751 | st = rsxx_eeh_remap_dmas(card); | ||
752 | if (st) | ||
753 | goto failed_remap_dmas; | ||
754 | |||
755 | spin_lock_irqsave(&card->irq_lock, flags); | 752 | spin_lock_irqsave(&card->irq_lock, flags); |
756 | if (card->n_targets & RSXX_MAX_TARGETS) | 753 | if (card->n_targets & RSXX_MAX_TARGETS) |
757 | rsxx_enable_ier_and_isr(card, CR_INTR_ALL_G); | 754 | rsxx_enable_ier_and_isr(card, CR_INTR_ALL_G); |
@@ -778,7 +775,6 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev) | |||
778 | return PCI_ERS_RESULT_RECOVERED; | 775 | return PCI_ERS_RESULT_RECOVERED; |
779 | 776 | ||
780 | failed_hw_buffers_init: | 777 | failed_hw_buffers_init: |
781 | failed_remap_dmas: | ||
782 | for (i = 0; i < card->n_targets; i++) { | 778 | for (i = 0; i < card->n_targets; i++) { |
783 | if (card->ctrl[i].status.buf) | 779 | if (card->ctrl[i].status.buf) |
784 | pci_free_consistent(card->dev, | 780 | pci_free_consistent(card->dev, |
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index d7af441880be..2284f5d3a54a 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c | |||
@@ -295,13 +295,15 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card) | |||
295 | return -ENOMEM; | 295 | return -ENOMEM; |
296 | } | 296 | } |
297 | 297 | ||
298 | blk_size = card->config.data.block_size; | 298 | if (card->config_valid) { |
299 | blk_size = card->config.data.block_size; | ||
300 | blk_queue_dma_alignment(card->queue, blk_size - 1); | ||
301 | blk_queue_logical_block_size(card->queue, blk_size); | ||
302 | } | ||
299 | 303 | ||
300 | blk_queue_make_request(card->queue, rsxx_make_request); | 304 | blk_queue_make_request(card->queue, rsxx_make_request); |
301 | blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY); | 305 | blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY); |
302 | blk_queue_dma_alignment(card->queue, blk_size - 1); | ||
303 | blk_queue_max_hw_sectors(card->queue, blkdev_max_hw_sectors); | 306 | blk_queue_max_hw_sectors(card->queue, blkdev_max_hw_sectors); |
304 | blk_queue_logical_block_size(card->queue, blk_size); | ||
305 | blk_queue_physical_block_size(card->queue, RSXX_HW_BLK_SIZE); | 307 | blk_queue_physical_block_size(card->queue, RSXX_HW_BLK_SIZE); |
306 | 308 | ||
307 | queue_flag_set_unlocked(QUEUE_FLAG_NONROT, card->queue); | 309 | queue_flag_set_unlocked(QUEUE_FLAG_NONROT, card->queue); |
diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c index bed32f16b084..fc88ba3e1bd2 100644 --- a/drivers/block/rsxx/dma.c +++ b/drivers/block/rsxx/dma.c | |||
@@ -221,6 +221,21 @@ static void dma_intr_coal_auto_tune(struct rsxx_cardinfo *card) | |||
221 | } | 221 | } |
222 | 222 | ||
223 | /*----------------- RSXX DMA Handling -------------------*/ | 223 | /*----------------- RSXX DMA Handling -------------------*/ |
224 | static void rsxx_free_dma(struct rsxx_dma_ctrl *ctrl, struct rsxx_dma *dma) | ||
225 | { | ||
226 | if (dma->cmd != HW_CMD_BLK_DISCARD) { | ||
227 | if (!pci_dma_mapping_error(ctrl->card->dev, dma->dma_addr)) { | ||
228 | pci_unmap_page(ctrl->card->dev, dma->dma_addr, | ||
229 | get_dma_size(dma), | ||
230 | dma->cmd == HW_CMD_BLK_WRITE ? | ||
231 | PCI_DMA_TODEVICE : | ||
232 | PCI_DMA_FROMDEVICE); | ||
233 | } | ||
234 | } | ||
235 | |||
236 | kmem_cache_free(rsxx_dma_pool, dma); | ||
237 | } | ||
238 | |||
224 | static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl, | 239 | static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl, |
225 | struct rsxx_dma *dma, | 240 | struct rsxx_dma *dma, |
226 | unsigned int status) | 241 | unsigned int status) |
@@ -232,21 +247,14 @@ static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl, | |||
232 | if (status & DMA_CANCELLED) | 247 | if (status & DMA_CANCELLED) |
233 | ctrl->stats.dma_cancelled++; | 248 | ctrl->stats.dma_cancelled++; |
234 | 249 | ||
235 | if (dma->dma_addr) | ||
236 | pci_unmap_page(ctrl->card->dev, dma->dma_addr, | ||
237 | get_dma_size(dma), | ||
238 | dma->cmd == HW_CMD_BLK_WRITE ? | ||
239 | PCI_DMA_TODEVICE : | ||
240 | PCI_DMA_FROMDEVICE); | ||
241 | |||
242 | if (dma->cb) | 250 | if (dma->cb) |
243 | dma->cb(ctrl->card, dma->cb_data, status ? 1 : 0); | 251 | dma->cb(ctrl->card, dma->cb_data, status ? 1 : 0); |
244 | 252 | ||
245 | kmem_cache_free(rsxx_dma_pool, dma); | 253 | rsxx_free_dma(ctrl, dma); |
246 | } | 254 | } |
247 | 255 | ||
248 | int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, | 256 | int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, |
249 | struct list_head *q) | 257 | struct list_head *q, unsigned int done) |
250 | { | 258 | { |
251 | struct rsxx_dma *dma; | 259 | struct rsxx_dma *dma; |
252 | struct rsxx_dma *tmp; | 260 | struct rsxx_dma *tmp; |
@@ -254,7 +262,10 @@ int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, | |||
254 | 262 | ||
255 | list_for_each_entry_safe(dma, tmp, q, list) { | 263 | list_for_each_entry_safe(dma, tmp, q, list) { |
256 | list_del(&dma->list); | 264 | list_del(&dma->list); |
257 | rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); | 265 | if (done & COMPLETE_DMA) |
266 | rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); | ||
267 | else | ||
268 | rsxx_free_dma(ctrl, dma); | ||
258 | cnt++; | 269 | cnt++; |
259 | } | 270 | } |
260 | 271 | ||
@@ -370,7 +381,7 @@ static void dma_engine_stalled(unsigned long data) | |||
370 | 381 | ||
371 | /* Clean up the DMA queue */ | 382 | /* Clean up the DMA queue */ |
372 | spin_lock(&ctrl->queue_lock); | 383 | spin_lock(&ctrl->queue_lock); |
373 | cnt = rsxx_cleanup_dma_queue(ctrl, &ctrl->queue); | 384 | cnt = rsxx_cleanup_dma_queue(ctrl, &ctrl->queue, COMPLETE_DMA); |
374 | spin_unlock(&ctrl->queue_lock); | 385 | spin_unlock(&ctrl->queue_lock); |
375 | 386 | ||
376 | cnt += rsxx_dma_cancel(ctrl); | 387 | cnt += rsxx_dma_cancel(ctrl); |
@@ -388,6 +399,7 @@ static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl) | |||
388 | int tag; | 399 | int tag; |
389 | int cmds_pending = 0; | 400 | int cmds_pending = 0; |
390 | struct hw_cmd *hw_cmd_buf; | 401 | struct hw_cmd *hw_cmd_buf; |
402 | int dir; | ||
391 | 403 | ||
392 | hw_cmd_buf = ctrl->cmd.buf; | 404 | hw_cmd_buf = ctrl->cmd.buf; |
393 | 405 | ||
@@ -424,6 +436,31 @@ static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl) | |||
424 | continue; | 436 | continue; |
425 | } | 437 | } |
426 | 438 | ||
439 | if (dma->cmd != HW_CMD_BLK_DISCARD) { | ||
440 | if (dma->cmd == HW_CMD_BLK_WRITE) | ||
441 | dir = PCI_DMA_TODEVICE; | ||
442 | else | ||
443 | dir = PCI_DMA_FROMDEVICE; | ||
444 | |||
445 | /* | ||
446 | * The function pci_map_page is placed here because we | ||
447 | * can only, by design, issue up to 255 commands to the | ||
448 | * hardware at one time per DMA channel. So the maximum | ||
449 | * amount of mapped memory would be 255 * 4 channels * | ||
450 | * 4096 Bytes which is less than 2GB, the limit of a x8 | ||
451 | * Non-HWWD PCIe slot. This way the pci_map_page | ||
452 | * function should never fail because of a lack of | ||
453 | * mappable memory. | ||
454 | */ | ||
455 | dma->dma_addr = pci_map_page(ctrl->card->dev, dma->page, | ||
456 | dma->pg_off, dma->sub_page.cnt << 9, dir); | ||
457 | if (pci_dma_mapping_error(ctrl->card->dev, dma->dma_addr)) { | ||
458 | push_tracker(ctrl->trackers, tag); | ||
459 | rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); | ||
460 | continue; | ||
461 | } | ||
462 | } | ||
463 | |||
427 | set_tracker_dma(ctrl->trackers, tag, dma); | 464 | set_tracker_dma(ctrl->trackers, tag, dma); |
428 | hw_cmd_buf[ctrl->cmd.idx].command = dma->cmd; | 465 | hw_cmd_buf[ctrl->cmd.idx].command = dma->cmd; |
429 | hw_cmd_buf[ctrl->cmd.idx].tag = tag; | 466 | hw_cmd_buf[ctrl->cmd.idx].tag = tag; |
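The comment in the hunk above justifies deferring pci_map_page() to issue time with a back-of-the-envelope bound: at most 255 outstanding commands per channel, 4 channels, 4 KiB per command. A quick standalone check of that arithmetic (plain userspace C, not driver code; the figures are taken straight from the comment):

	#include <stdio.h>

	int main(void)
	{
		/* Figures quoted in the rsxx_issue_dmas() comment above. */
		const unsigned long cmds_per_channel = 255;
		const unsigned long channels = 4;
		const unsigned long bytes_per_cmd = 4096;	/* one 4 KiB page per DMA */
		unsigned long peak = cmds_per_channel * channels * bytes_per_cmd;

		/* Prints 4177920 bytes (4.0 MiB), comfortably below the 2 GB
		 * figure cited for an x8 non-HWWD PCIe slot, so the mapping
		 * should not fail for lack of mappable memory. */
		printf("peak mapped memory: %lu bytes (%.1f MiB)\n",
		       peak, peak / (1024.0 * 1024.0));
		return 0;
	}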
@@ -620,14 +657,6 @@ static int rsxx_queue_dma(struct rsxx_cardinfo *card, | |||
620 | if (!dma) | 657 | if (!dma) |
621 | return -ENOMEM; | 658 | return -ENOMEM; |
622 | 659 | ||
623 | dma->dma_addr = pci_map_page(card->dev, page, pg_off, dma_len, | ||
624 | dir ? PCI_DMA_TODEVICE : | ||
625 | PCI_DMA_FROMDEVICE); | ||
626 | if (!dma->dma_addr) { | ||
627 | kmem_cache_free(rsxx_dma_pool, dma); | ||
628 | return -ENOMEM; | ||
629 | } | ||
630 | |||
631 | dma->cmd = dir ? HW_CMD_BLK_WRITE : HW_CMD_BLK_READ; | 660 | dma->cmd = dir ? HW_CMD_BLK_WRITE : HW_CMD_BLK_READ; |
632 | dma->laddr = laddr; | 661 | dma->laddr = laddr; |
633 | dma->sub_page.off = (dma_off >> 9); | 662 | dma->sub_page.off = (dma_off >> 9); |
@@ -736,11 +765,9 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, | |||
736 | return 0; | 765 | return 0; |
737 | 766 | ||
738 | bvec_err: | 767 | bvec_err: |
739 | for (i = 0; i < card->n_targets; i++) { | 768 | for (i = 0; i < card->n_targets; i++) |
740 | spin_lock_bh(&card->ctrl[i].queue_lock); | 769 | rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i], |
741 | rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i]); | 770 | FREE_DMA); |
742 | spin_unlock_bh(&card->ctrl[i].queue_lock); | ||
743 | } | ||
744 | 771 | ||
745 | return st; | 772 | return st; |
746 | } | 773 | } |
@@ -990,7 +1017,7 @@ void rsxx_dma_destroy(struct rsxx_cardinfo *card) | |||
990 | 1017 | ||
991 | /* Clean up the DMA queue */ | 1018 | /* Clean up the DMA queue */ |
992 | spin_lock_bh(&ctrl->queue_lock); | 1019 | spin_lock_bh(&ctrl->queue_lock); |
993 | rsxx_cleanup_dma_queue(ctrl, &ctrl->queue); | 1020 | rsxx_cleanup_dma_queue(ctrl, &ctrl->queue, COMPLETE_DMA); |
994 | spin_unlock_bh(&ctrl->queue_lock); | 1021 | spin_unlock_bh(&ctrl->queue_lock); |
995 | 1022 | ||
996 | rsxx_dma_cancel(ctrl); | 1023 | rsxx_dma_cancel(ctrl); |
@@ -1032,6 +1059,14 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) | |||
1032 | else | 1059 | else |
1033 | card->ctrl[i].stats.reads_issued--; | 1060 | card->ctrl[i].stats.reads_issued--; |
1034 | 1061 | ||
1062 | if (dma->cmd != HW_CMD_BLK_DISCARD) { | ||
1063 | pci_unmap_page(card->dev, dma->dma_addr, | ||
1064 | get_dma_size(dma), | ||
1065 | dma->cmd == HW_CMD_BLK_WRITE ? | ||
1066 | PCI_DMA_TODEVICE : | ||
1067 | PCI_DMA_FROMDEVICE); | ||
1068 | } | ||
1069 | |||
1035 | list_add_tail(&dma->list, &issued_dmas[i]); | 1070 | list_add_tail(&dma->list, &issued_dmas[i]); |
1036 | push_tracker(card->ctrl[i].trackers, j); | 1071 | push_tracker(card->ctrl[i].trackers, j); |
1037 | cnt++; | 1072 | cnt++; |
@@ -1043,15 +1078,6 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) | |||
1043 | atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth); | 1078 | atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth); |
1044 | card->ctrl[i].stats.sw_q_depth += cnt; | 1079 | card->ctrl[i].stats.sw_q_depth += cnt; |
1045 | card->ctrl[i].e_cnt = 0; | 1080 | card->ctrl[i].e_cnt = 0; |
1046 | |||
1047 | list_for_each_entry(dma, &card->ctrl[i].queue, list) { | ||
1048 | if (dma->dma_addr) | ||
1049 | pci_unmap_page(card->dev, dma->dma_addr, | ||
1050 | get_dma_size(dma), | ||
1051 | dma->cmd == HW_CMD_BLK_WRITE ? | ||
1052 | PCI_DMA_TODEVICE : | ||
1053 | PCI_DMA_FROMDEVICE); | ||
1054 | } | ||
1055 | spin_unlock_bh(&card->ctrl[i].queue_lock); | 1081 | spin_unlock_bh(&card->ctrl[i].queue_lock); |
1056 | } | 1082 | } |
1057 | 1083 | ||
@@ -1060,31 +1086,6 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) | |||
1060 | return 0; | 1086 | return 0; |
1061 | } | 1087 | } |
1062 | 1088 | ||
1063 | int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card) | ||
1064 | { | ||
1065 | struct rsxx_dma *dma; | ||
1066 | int i; | ||
1067 | |||
1068 | for (i = 0; i < card->n_targets; i++) { | ||
1069 | spin_lock_bh(&card->ctrl[i].queue_lock); | ||
1070 | list_for_each_entry(dma, &card->ctrl[i].queue, list) { | ||
1071 | dma->dma_addr = pci_map_page(card->dev, dma->page, | ||
1072 | dma->pg_off, get_dma_size(dma), | ||
1073 | dma->cmd == HW_CMD_BLK_WRITE ? | ||
1074 | PCI_DMA_TODEVICE : | ||
1075 | PCI_DMA_FROMDEVICE); | ||
1076 | if (!dma->dma_addr) { | ||
1077 | spin_unlock_bh(&card->ctrl[i].queue_lock); | ||
1078 | kmem_cache_free(rsxx_dma_pool, dma); | ||
1079 | return -ENOMEM; | ||
1080 | } | ||
1081 | } | ||
1082 | spin_unlock_bh(&card->ctrl[i].queue_lock); | ||
1083 | } | ||
1084 | |||
1085 | return 0; | ||
1086 | } | ||
1087 | |||
1088 | int rsxx_dma_init(void) | 1089 | int rsxx_dma_init(void) |
1089 | { | 1090 | { |
1090 | rsxx_dma_pool = KMEM_CACHE(rsxx_dma, SLAB_HWCACHE_ALIGN); | 1091 | rsxx_dma_pool = KMEM_CACHE(rsxx_dma, SLAB_HWCACHE_ALIGN); |
diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h index 5ad5055a4104..6bbc64d0f690 100644 --- a/drivers/block/rsxx/rsxx_priv.h +++ b/drivers/block/rsxx/rsxx_priv.h | |||
@@ -52,7 +52,7 @@ struct proc_cmd; | |||
52 | #define RS70_PCI_REV_SUPPORTED 4 | 52 | #define RS70_PCI_REV_SUPPORTED 4 |
53 | 53 | ||
54 | #define DRIVER_NAME "rsxx" | 54 | #define DRIVER_NAME "rsxx" |
55 | #define DRIVER_VERSION "4.0" | 55 | #define DRIVER_VERSION "4.0.3.2516" |
56 | 56 | ||
57 | /* Block size is 4096 */ | 57 | /* Block size is 4096 */ |
58 | #define RSXX_HW_BLK_SHIFT 12 | 58 | #define RSXX_HW_BLK_SHIFT 12 |
@@ -345,6 +345,11 @@ enum rsxx_creg_stat { | |||
345 | CREG_STAT_TAG_MASK = 0x0000ff00, | 345 | CREG_STAT_TAG_MASK = 0x0000ff00, |
346 | }; | 346 | }; |
347 | 347 | ||
348 | enum rsxx_dma_finish { | ||
349 | FREE_DMA = 0x0, | ||
350 | COMPLETE_DMA = 0x1, | ||
351 | }; | ||
352 | |||
348 | static inline unsigned int CREG_DATA(int N) | 353 | static inline unsigned int CREG_DATA(int N) |
349 | { | 354 | { |
350 | return CREG_DATA0 + (N << 2); | 355 | return CREG_DATA0 + (N << 2); |
@@ -379,7 +384,9 @@ typedef void (*rsxx_dma_cb)(struct rsxx_cardinfo *card, | |||
379 | int rsxx_dma_setup(struct rsxx_cardinfo *card); | 384 | int rsxx_dma_setup(struct rsxx_cardinfo *card); |
380 | void rsxx_dma_destroy(struct rsxx_cardinfo *card); | 385 | void rsxx_dma_destroy(struct rsxx_cardinfo *card); |
381 | int rsxx_dma_init(void); | 386 | int rsxx_dma_init(void); |
382 | int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, struct list_head *q); | 387 | int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, |
388 | struct list_head *q, | ||
389 | unsigned int done); | ||
383 | int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl); | 390 | int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl); |
384 | void rsxx_dma_cleanup(void); | 391 | void rsxx_dma_cleanup(void); |
385 | void rsxx_dma_queue_reset(struct rsxx_cardinfo *card); | 392 | void rsxx_dma_queue_reset(struct rsxx_cardinfo *card); |
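To make the new cleanup contract concrete: rsxx_cleanup_dma_queue() now takes a done argument, and the call sites changed above pass COMPLETE_DMA where the submitter's callback must still see a cancellation (EEH failure, stalled engine, destroy) and FREE_DMA on the bvec_err unwind path, where the partially built lists are simply discarded. A minimal userspace model of that dispatch, using simplified stand-in structures rather than the driver's types:

	#include <stdio.h>
	#include <stdlib.h>

	enum dma_finish { FREE_DMA = 0x0, COMPLETE_DMA = 0x1 };	/* mirrors rsxx_dma_finish */

	struct dma {
		struct dma *next;
		void (*cb)(int status);		/* completion callback, may be NULL */
	};

	static void complete_dma(struct dma *dma, int status)
	{
		if (dma->cb)
			dma->cb(status);	/* report the cancellation to the submitter */
		free(dma);
	}

	/* Model of rsxx_cleanup_dma_queue(): drain the queue, either completing
	 * or just freeing each entry depending on 'done'. */
	static int cleanup_dma_queue(struct dma **q, unsigned int done)
	{
		int cnt = 0;
		struct dma *dma, *tmp;

		for (dma = *q; dma; dma = tmp) {
			tmp = dma->next;
			if (done & COMPLETE_DMA)
				complete_dma(dma, /* DMA_CANCELLED */ 1);
			else
				free(dma);	/* FREE_DMA: no callback, no accounting */
			cnt++;
		}
		*q = NULL;
		return cnt;
	}

	static void cb(int status) { printf("dma cancelled, status=%d\n", status); }

	int main(void)
	{
		struct dma *q = calloc(1, sizeof(*q));
		q->cb = cb;
		printf("cleaned %d entries\n", cleanup_dma_queue(&q, COMPLETE_DMA));
		return 0;
	}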
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c new file mode 100644 index 000000000000..9199c93be926 --- /dev/null +++ b/drivers/block/skd_main.c | |||
@@ -0,0 +1,5432 @@ | |||
1 | /* Copyright 2012 STEC, Inc. | ||
2 | * | ||
3 | * This file is licensed under the terms of the 3-clause | ||
4 | * BSD License (http://opensource.org/licenses/BSD-3-Clause) | ||
5 | * or the GNU GPL-2.0 (http://www.gnu.org/licenses/gpl-2.0.html), | ||
6 | * at your option. Both licenses are also available in the LICENSE file | ||
7 | * distributed with this project. This file may not be copied, modified, | ||
8 | * or distributed except in accordance with those terms. | ||
9 | * Gordoni Waidhofer <gwaidhofer@stec-inc.com> | ||
10 | * Initial Driver Design! | ||
11 | * Thomas Swann <tswann@stec-inc.com> | ||
12 | * Interrupt handling. | ||
13 | * Ramprasad Chinthekindi <rchinthekindi@stec-inc.com> | ||
14 | * biomode implementation. | ||
15 | * Akhil Bhansali <abhansali@stec-inc.com> | ||
16 | * Added support for DISCARD / FLUSH and FUA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/kernel.h> | ||
20 | #include <linux/module.h> | ||
21 | #include <linux/init.h> | ||
22 | #include <linux/pci.h> | ||
23 | #include <linux/slab.h> | ||
24 | #include <linux/spinlock.h> | ||
25 | #include <linux/blkdev.h> | ||
26 | #include <linux/sched.h> | ||
27 | #include <linux/interrupt.h> | ||
28 | #include <linux/compiler.h> | ||
29 | #include <linux/workqueue.h> | ||
30 | #include <linux/bitops.h> | ||
31 | #include <linux/delay.h> | ||
32 | #include <linux/time.h> | ||
33 | #include <linux/hdreg.h> | ||
34 | #include <linux/dma-mapping.h> | ||
35 | #include <linux/completion.h> | ||
36 | #include <linux/scatterlist.h> | ||
37 | #include <linux/version.h> | ||
38 | #include <linux/err.h> | ||
39 | #include <linux/scatterlist.h> | ||
40 | #include <linux/aer.h> | ||
41 | #include <linux/ctype.h> | ||
42 | #include <linux/wait.h> | ||
43 | #include <linux/uio.h> | ||
44 | #include <scsi/scsi.h> | ||
45 | #include <scsi/sg.h> | ||
46 | #include <linux/io.h> | ||
47 | #include <linux/uaccess.h> | ||
48 | #include <asm/unaligned.h> | ||
49 | |||
50 | #include "skd_s1120.h" | ||
51 | |||
52 | static int skd_dbg_level; | ||
53 | static int skd_isr_comp_limit = 4; | ||
54 | |||
55 | enum { | ||
56 | STEC_LINK_2_5GTS = 0, | ||
57 | STEC_LINK_5GTS = 1, | ||
58 | STEC_LINK_8GTS = 2, | ||
59 | STEC_LINK_UNKNOWN = 0xFF | ||
60 | }; | ||
61 | |||
62 | enum { | ||
63 | SKD_FLUSH_INITIALIZER, | ||
64 | SKD_FLUSH_ZERO_SIZE_FIRST, | ||
65 | SKD_FLUSH_DATA_SECOND, | ||
66 | }; | ||
67 | |||
68 | #define SKD_ASSERT(expr) \ | ||
69 | do { \ | ||
70 | if (unlikely(!(expr))) { \ | ||
71 | pr_err("Assertion failed! %s,%s,%s,line=%d\n", \ | ||
72 | # expr, __FILE__, __func__, __LINE__); \ | ||
73 | } \ | ||
74 | } while (0) | ||
75 | |||
76 | #define DRV_NAME "skd" | ||
77 | #define DRV_VERSION "2.2.1" | ||
78 | #define DRV_BUILD_ID "0260" | ||
79 | #define PFX DRV_NAME ": " | ||
80 | #define DRV_BIN_VERSION 0x100 | ||
81 | #define DRV_VER_COMPL "2.2.1." DRV_BUILD_ID | ||
82 | |||
83 | MODULE_AUTHOR("bug-reports: support@stec-inc.com"); | ||
84 | MODULE_LICENSE("Dual BSD/GPL"); | ||
85 | |||
86 | MODULE_DESCRIPTION("STEC s1120 PCIe SSD block driver (b" DRV_BUILD_ID ")"); | ||
87 | MODULE_VERSION(DRV_VERSION "-" DRV_BUILD_ID); | ||
88 | |||
89 | #define PCI_VENDOR_ID_STEC 0x1B39 | ||
90 | #define PCI_DEVICE_ID_S1120 0x0001 | ||
91 | |||
92 | #define SKD_FUA_NV (1 << 1) | ||
93 | #define SKD_MINORS_PER_DEVICE 16 | ||
94 | |||
95 | #define SKD_MAX_QUEUE_DEPTH 200u | ||
96 | |||
97 | #define SKD_PAUSE_TIMEOUT (5 * 1000) | ||
98 | |||
99 | #define SKD_N_FITMSG_BYTES (512u) | ||
100 | |||
101 | #define SKD_N_SPECIAL_CONTEXT 32u | ||
102 | #define SKD_N_SPECIAL_FITMSG_BYTES (128u) | ||
103 | |||
104 | /* SG elements are 32 bytes, so we can make this 4096 and still be under the | ||
105 | * 128KB limit. That allows 4096*4K = 16M xfer size | ||
106 | */ | ||
107 | #define SKD_N_SG_PER_REQ_DEFAULT 256u | ||
108 | #define SKD_N_SG_PER_SPECIAL 256u | ||
109 | |||
110 | #define SKD_N_COMPLETION_ENTRY 256u | ||
111 | #define SKD_N_READ_CAP_BYTES (8u) | ||
112 | |||
113 | #define SKD_N_INTERNAL_BYTES (512u) | ||
114 | |||
115 | /* 5 bits of uniquifier, 0xF800 */ | ||

116 | #define SKD_ID_INCR (0x400) | ||
117 | #define SKD_ID_TABLE_MASK (3u << 8u) | ||
118 | #define SKD_ID_RW_REQUEST (0u << 8u) | ||
119 | #define SKD_ID_INTERNAL (1u << 8u) | ||
120 | #define SKD_ID_SPECIAL_REQUEST (2u << 8u) | ||
121 | #define SKD_ID_FIT_MSG (3u << 8u) | ||
122 | #define SKD_ID_SLOT_MASK 0x00FFu | ||
123 | #define SKD_ID_SLOT_AND_TABLE_MASK 0x03FFu | ||
124 | |||
125 | #define SKD_N_TIMEOUT_SLOT 4u | ||
126 | #define SKD_TIMEOUT_SLOT_MASK 3u | ||
127 | |||
128 | #define SKD_N_MAX_SECTORS 2048u | ||
129 | |||
130 | #define SKD_MAX_RETRIES 2u | ||
131 | |||
132 | #define SKD_TIMER_SECONDS(seconds) (seconds) | ||
133 | #define SKD_TIMER_MINUTES(minutes) ((minutes) * (60)) | ||
134 | |||
135 | #define INQ_STD_NBYTES 36 | ||
136 | #define SKD_DISCARD_CDB_LENGTH 24 | ||
137 | |||
138 | enum skd_drvr_state { | ||
139 | SKD_DRVR_STATE_LOAD, | ||
140 | SKD_DRVR_STATE_IDLE, | ||
141 | SKD_DRVR_STATE_BUSY, | ||
142 | SKD_DRVR_STATE_STARTING, | ||
143 | SKD_DRVR_STATE_ONLINE, | ||
144 | SKD_DRVR_STATE_PAUSING, | ||
145 | SKD_DRVR_STATE_PAUSED, | ||
146 | SKD_DRVR_STATE_DRAINING_TIMEOUT, | ||
147 | SKD_DRVR_STATE_RESTARTING, | ||
148 | SKD_DRVR_STATE_RESUMING, | ||
149 | SKD_DRVR_STATE_STOPPING, | ||
150 | SKD_DRVR_STATE_FAULT, | ||
151 | SKD_DRVR_STATE_DISAPPEARED, | ||
152 | SKD_DRVR_STATE_PROTOCOL_MISMATCH, | ||
153 | SKD_DRVR_STATE_BUSY_ERASE, | ||
154 | SKD_DRVR_STATE_BUSY_SANITIZE, | ||
155 | SKD_DRVR_STATE_BUSY_IMMINENT, | ||
156 | SKD_DRVR_STATE_WAIT_BOOT, | ||
157 | SKD_DRVR_STATE_SYNCING, | ||
158 | }; | ||
159 | |||
160 | #define SKD_WAIT_BOOT_TIMO SKD_TIMER_SECONDS(90u) | ||
161 | #define SKD_STARTING_TIMO SKD_TIMER_SECONDS(8u) | ||
162 | #define SKD_RESTARTING_TIMO SKD_TIMER_MINUTES(4u) | ||
163 | #define SKD_DRAINING_TIMO SKD_TIMER_SECONDS(6u) | ||
164 | #define SKD_BUSY_TIMO SKD_TIMER_MINUTES(20u) | ||
165 | #define SKD_STARTED_BUSY_TIMO SKD_TIMER_SECONDS(60u) | ||
166 | #define SKD_START_WAIT_SECONDS 90u | ||
167 | |||
168 | enum skd_req_state { | ||
169 | SKD_REQ_STATE_IDLE, | ||
170 | SKD_REQ_STATE_SETUP, | ||
171 | SKD_REQ_STATE_BUSY, | ||
172 | SKD_REQ_STATE_COMPLETED, | ||
173 | SKD_REQ_STATE_TIMEOUT, | ||
174 | SKD_REQ_STATE_ABORTED, | ||
175 | }; | ||
176 | |||
177 | enum skd_fit_msg_state { | ||
178 | SKD_MSG_STATE_IDLE, | ||
179 | SKD_MSG_STATE_BUSY, | ||
180 | }; | ||
181 | |||
182 | enum skd_check_status_action { | ||
183 | SKD_CHECK_STATUS_REPORT_GOOD, | ||
184 | SKD_CHECK_STATUS_REPORT_SMART_ALERT, | ||
185 | SKD_CHECK_STATUS_REQUEUE_REQUEST, | ||
186 | SKD_CHECK_STATUS_REPORT_ERROR, | ||
187 | SKD_CHECK_STATUS_BUSY_IMMINENT, | ||
188 | }; | ||
189 | |||
190 | struct skd_fitmsg_context { | ||
191 | enum skd_fit_msg_state state; | ||
192 | |||
193 | struct skd_fitmsg_context *next; | ||
194 | |||
195 | u32 id; | ||
196 | u16 outstanding; | ||
197 | |||
198 | u32 length; | ||
199 | u32 offset; | ||
200 | |||
201 | u8 *msg_buf; | ||
202 | dma_addr_t mb_dma_address; | ||
203 | }; | ||
204 | |||
205 | struct skd_request_context { | ||
206 | enum skd_req_state state; | ||
207 | |||
208 | struct skd_request_context *next; | ||
209 | |||
210 | u16 id; | ||
211 | u32 fitmsg_id; | ||
212 | |||
213 | struct request *req; | ||
214 | u8 flush_cmd; | ||
215 | u8 discard_page; | ||
216 | |||
217 | u32 timeout_stamp; | ||
218 | u8 sg_data_dir; | ||
219 | struct scatterlist *sg; | ||
220 | u32 n_sg; | ||
221 | u32 sg_byte_count; | ||
222 | |||
223 | struct fit_sg_descriptor *sksg_list; | ||
224 | dma_addr_t sksg_dma_address; | ||
225 | |||
226 | struct fit_completion_entry_v1 completion; | ||
227 | |||
228 | struct fit_comp_error_info err_info; | ||
229 | |||
230 | }; | ||
231 | #define SKD_DATA_DIR_HOST_TO_CARD 1 | ||
232 | #define SKD_DATA_DIR_CARD_TO_HOST 2 | ||
233 | #define SKD_DATA_DIR_NONE 3 /* especially for DISCARD requests. */ | ||
234 | |||
235 | struct skd_special_context { | ||
236 | struct skd_request_context req; | ||
237 | |||
238 | u8 orphaned; | ||
239 | |||
240 | void *data_buf; | ||
241 | dma_addr_t db_dma_address; | ||
242 | |||
243 | u8 *msg_buf; | ||
244 | dma_addr_t mb_dma_address; | ||
245 | }; | ||
246 | |||
247 | struct skd_sg_io { | ||
248 | fmode_t mode; | ||
249 | void __user *argp; | ||
250 | |||
251 | struct sg_io_hdr sg; | ||
252 | |||
253 | u8 cdb[16]; | ||
254 | |||
255 | u32 dxfer_len; | ||
256 | u32 iovcnt; | ||
257 | struct sg_iovec *iov; | ||
258 | struct sg_iovec no_iov_iov; | ||
259 | |||
260 | struct skd_special_context *skspcl; | ||
261 | }; | ||
262 | |||
263 | typedef enum skd_irq_type { | ||
264 | SKD_IRQ_LEGACY, | ||
265 | SKD_IRQ_MSI, | ||
266 | SKD_IRQ_MSIX | ||
267 | } skd_irq_type_t; | ||
268 | |||
269 | #define SKD_MAX_BARS 2 | ||
270 | |||
271 | struct skd_device { | ||
272 | volatile void __iomem *mem_map[SKD_MAX_BARS]; | ||
273 | resource_size_t mem_phys[SKD_MAX_BARS]; | ||
274 | u32 mem_size[SKD_MAX_BARS]; | ||
275 | |||
276 | skd_irq_type_t irq_type; | ||
277 | u32 msix_count; | ||
278 | struct skd_msix_entry *msix_entries; | ||
279 | |||
280 | struct pci_dev *pdev; | ||
281 | int pcie_error_reporting_is_enabled; | ||
282 | |||
283 | spinlock_t lock; | ||
284 | struct gendisk *disk; | ||
285 | struct request_queue *queue; | ||
286 | struct device *class_dev; | ||
287 | int gendisk_on; | ||
288 | int sync_done; | ||
289 | |||
290 | atomic_t device_count; | ||
291 | u32 devno; | ||
292 | u32 major; | ||
293 | char name[32]; | ||
294 | char isr_name[30]; | ||
295 | |||
296 | enum skd_drvr_state state; | ||
297 | u32 drive_state; | ||
298 | |||
299 | u32 in_flight; | ||
300 | u32 cur_max_queue_depth; | ||
301 | u32 queue_low_water_mark; | ||
302 | u32 dev_max_queue_depth; | ||
303 | |||
304 | u32 num_fitmsg_context; | ||
305 | u32 num_req_context; | ||
306 | |||
307 | u32 timeout_slot[SKD_N_TIMEOUT_SLOT]; | ||
308 | u32 timeout_stamp; | ||
309 | struct skd_fitmsg_context *skmsg_free_list; | ||
310 | struct skd_fitmsg_context *skmsg_table; | ||
311 | |||
312 | struct skd_request_context *skreq_free_list; | ||
313 | struct skd_request_context *skreq_table; | ||
314 | |||
315 | struct skd_special_context *skspcl_free_list; | ||
316 | struct skd_special_context *skspcl_table; | ||
317 | |||
318 | struct skd_special_context internal_skspcl; | ||
319 | u32 read_cap_blocksize; | ||
320 | u32 read_cap_last_lba; | ||
321 | int read_cap_is_valid; | ||
322 | int inquiry_is_valid; | ||
323 | u8 inq_serial_num[13]; /*12 chars plus null term */ | ||
324 | u8 id_str[80]; /* holds a composite name (pci + sernum) */ | ||
325 | |||
326 | u8 skcomp_cycle; | ||
327 | u32 skcomp_ix; | ||
328 | struct fit_completion_entry_v1 *skcomp_table; | ||
329 | struct fit_comp_error_info *skerr_table; | ||
330 | dma_addr_t cq_dma_address; | ||
331 | |||
332 | wait_queue_head_t waitq; | ||
333 | |||
334 | struct timer_list timer; | ||
335 | u32 timer_countdown; | ||
336 | u32 timer_substate; | ||
337 | |||
338 | int n_special; | ||
339 | int sgs_per_request; | ||
340 | u32 last_mtd; | ||
341 | |||
342 | u32 proto_ver; | ||
343 | |||
344 | int dbg_level; | ||
345 | u32 connect_time_stamp; | ||
346 | int connect_retries; | ||
347 | #define SKD_MAX_CONNECT_RETRIES 16 | ||
348 | u32 drive_jiffies; | ||
349 | |||
350 | u32 timo_slot; | ||
351 | |||
352 | |||
353 | struct work_struct completion_worker; | ||
354 | }; | ||
355 | |||
356 | #define SKD_WRITEL(DEV, VAL, OFF) skd_reg_write32(DEV, VAL, OFF) | ||
357 | #define SKD_READL(DEV, OFF) skd_reg_read32(DEV, OFF) | ||
358 | #define SKD_WRITEQ(DEV, VAL, OFF) skd_reg_write64(DEV, VAL, OFF) | ||
359 | |||
360 | static inline u32 skd_reg_read32(struct skd_device *skdev, u32 offset) | ||
361 | { | ||
362 | u32 val; | ||
363 | |||
364 | if (likely(skdev->dbg_level < 2)) | ||
365 | return readl(skdev->mem_map[1] + offset); | ||
366 | else { | ||
367 | barrier(); | ||
368 | val = readl(skdev->mem_map[1] + offset); | ||
369 | barrier(); | ||
370 | pr_debug("%s:%s:%d offset %x = %x\n", | ||
371 | skdev->name, __func__, __LINE__, offset, val); | ||
372 | return val; | ||
373 | } | ||
374 | |||
375 | } | ||
376 | |||
377 | static inline void skd_reg_write32(struct skd_device *skdev, u32 val, | ||
378 | u32 offset) | ||
379 | { | ||
380 | if (likely(skdev->dbg_level < 2)) { | ||
381 | writel(val, skdev->mem_map[1] + offset); | ||
382 | barrier(); | ||
383 | } else { | ||
384 | barrier(); | ||
385 | writel(val, skdev->mem_map[1] + offset); | ||
386 | barrier(); | ||
387 | pr_debug("%s:%s:%d offset %x = %x\n", | ||
388 | skdev->name, __func__, __LINE__, offset, val); | ||
389 | } | ||
390 | } | ||
391 | |||
392 | static inline void skd_reg_write64(struct skd_device *skdev, u64 val, | ||
393 | u32 offset) | ||
394 | { | ||
395 | if (likely(skdev->dbg_level < 2)) { | ||
396 | writeq(val, skdev->mem_map[1] + offset); | ||
397 | barrier(); | ||
398 | } else { | ||
399 | barrier(); | ||
400 | writeq(val, skdev->mem_map[1] + offset); | ||
401 | barrier(); | ||
402 | pr_debug("%s:%s:%d offset %x = %016llx\n", | ||
403 | skdev->name, __func__, __LINE__, offset, val); | ||
404 | } | ||
405 | } | ||
406 | |||
407 | |||
408 | #define SKD_IRQ_DEFAULT SKD_IRQ_MSI | ||
409 | static int skd_isr_type = SKD_IRQ_DEFAULT; | ||
410 | |||
411 | module_param(skd_isr_type, int, 0444); | ||
412 | MODULE_PARM_DESC(skd_isr_type, "Interrupt type capability." | ||
413 | " (0==legacy, 1==MSI, 2==MSI-X, default==1)"); | ||
414 | |||
415 | #define SKD_MAX_REQ_PER_MSG_DEFAULT 1 | ||
416 | static int skd_max_req_per_msg = SKD_MAX_REQ_PER_MSG_DEFAULT; | ||
417 | |||
418 | module_param(skd_max_req_per_msg, int, 0444); | ||
419 | MODULE_PARM_DESC(skd_max_req_per_msg, | ||
420 | "Maximum SCSI requests packed in a single message." | ||
421 | " (1-14, default==1)"); | ||
422 | |||
423 | #define SKD_MAX_QUEUE_DEPTH_DEFAULT 64 | ||
424 | #define SKD_MAX_QUEUE_DEPTH_DEFAULT_STR "64" | ||
425 | static int skd_max_queue_depth = SKD_MAX_QUEUE_DEPTH_DEFAULT; | ||
426 | |||
427 | module_param(skd_max_queue_depth, int, 0444); | ||
428 | MODULE_PARM_DESC(skd_max_queue_depth, | ||
429 | "Maximum SCSI requests issued to s1120." | ||
430 | " (1-200, default==" SKD_MAX_QUEUE_DEPTH_DEFAULT_STR ")"); | ||
431 | |||
432 | static int skd_sgs_per_request = SKD_N_SG_PER_REQ_DEFAULT; | ||
433 | module_param(skd_sgs_per_request, int, 0444); | ||
434 | MODULE_PARM_DESC(skd_sgs_per_request, | ||
435 | "Maximum SG elements per block request." | ||
436 | " (1-4096, default==256)"); | ||
437 | |||
438 | static int skd_max_pass_thru = SKD_N_SPECIAL_CONTEXT; | ||
439 | module_param(skd_max_pass_thru, int, 0444); | ||
440 | MODULE_PARM_DESC(skd_max_pass_thru, | ||
441 | "Maximum SCSI pass-thru at a time." " (1-50, default==32)"); | ||
442 | |||
443 | module_param(skd_dbg_level, int, 0444); | ||
444 | MODULE_PARM_DESC(skd_dbg_level, "s1120 debug level (0,1,2)"); | ||
445 | |||
446 | module_param(skd_isr_comp_limit, int, 0444); | ||
447 | MODULE_PARM_DESC(skd_isr_comp_limit, "s1120 isr comp limit (0=none) default=4"); | ||
448 | |||
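All of the parameters above are declared with mode 0444, so they are read-only through sysfs and only take effect when the module is loaded. Purely as an illustration (parameter names from the module_param() declarations above, values chosen arbitrarily within the documented ranges), MSI-X interrupts and a deeper queue could be requested with something like:

	modprobe skd skd_isr_type=2 skd_max_queue_depth=128 skd_max_req_per_msg=4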
449 | /* Major device number dynamically assigned. */ | ||
450 | static u32 skd_major; | ||
451 | |||
452 | static void skd_destruct(struct skd_device *skdev); | ||
453 | static const struct block_device_operations skd_blockdev_ops; | ||
454 | static void skd_send_fitmsg(struct skd_device *skdev, | ||
455 | struct skd_fitmsg_context *skmsg); | ||
456 | static void skd_send_special_fitmsg(struct skd_device *skdev, | ||
457 | struct skd_special_context *skspcl); | ||
458 | static void skd_request_fn(struct request_queue *rq); | ||
459 | static void skd_end_request(struct skd_device *skdev, | ||
460 | struct skd_request_context *skreq, int error); | ||
461 | static int skd_preop_sg_list(struct skd_device *skdev, | ||
462 | struct skd_request_context *skreq); | ||
463 | static void skd_postop_sg_list(struct skd_device *skdev, | ||
464 | struct skd_request_context *skreq); | ||
465 | |||
466 | static void skd_restart_device(struct skd_device *skdev); | ||
467 | static int skd_quiesce_dev(struct skd_device *skdev); | ||
468 | static int skd_unquiesce_dev(struct skd_device *skdev); | ||
469 | static void skd_release_special(struct skd_device *skdev, | ||
470 | struct skd_special_context *skspcl); | ||
471 | static void skd_disable_interrupts(struct skd_device *skdev); | ||
472 | static void skd_isr_fwstate(struct skd_device *skdev); | ||
473 | static void skd_recover_requests(struct skd_device *skdev, int requeue); | ||
474 | static void skd_soft_reset(struct skd_device *skdev); | ||
475 | |||
476 | static const char *skd_name(struct skd_device *skdev); | ||
477 | const char *skd_drive_state_to_str(int state); | ||
478 | const char *skd_skdev_state_to_str(enum skd_drvr_state state); | ||
479 | static void skd_log_skdev(struct skd_device *skdev, const char *event); | ||
480 | static void skd_log_skmsg(struct skd_device *skdev, | ||
481 | struct skd_fitmsg_context *skmsg, const char *event); | ||
482 | static void skd_log_skreq(struct skd_device *skdev, | ||
483 | struct skd_request_context *skreq, const char *event); | ||
484 | |||
485 | /* | ||
486 | ***************************************************************************** | ||
487 | * READ/WRITE REQUESTS | ||
488 | ***************************************************************************** | ||
489 | */ | ||
490 | static void skd_fail_all_pending(struct skd_device *skdev) | ||
491 | { | ||
492 | struct request_queue *q = skdev->queue; | ||
493 | struct request *req; | ||
494 | |||
495 | for (;; ) { | ||
496 | req = blk_peek_request(q); | ||
497 | if (req == NULL) | ||
498 | break; | ||
499 | blk_start_request(req); | ||
500 | __blk_end_request_all(req, -EIO); | ||
501 | } | ||
502 | } | ||
503 | |||
504 | static void | ||
505 | skd_prep_rw_cdb(struct skd_scsi_request *scsi_req, | ||
506 | int data_dir, unsigned lba, | ||
507 | unsigned count) | ||
508 | { | ||
509 | if (data_dir == READ) | ||
510 | scsi_req->cdb[0] = 0x28; | ||
511 | else | ||
512 | scsi_req->cdb[0] = 0x2a; | ||
513 | |||
514 | scsi_req->cdb[1] = 0; | ||
515 | scsi_req->cdb[2] = (lba & 0xff000000) >> 24; | ||
516 | scsi_req->cdb[3] = (lba & 0xff0000) >> 16; | ||
517 | scsi_req->cdb[4] = (lba & 0xff00) >> 8; | ||
518 | scsi_req->cdb[5] = (lba & 0xff); | ||
519 | scsi_req->cdb[6] = 0; | ||
520 | scsi_req->cdb[7] = (count & 0xff00) >> 8; | ||
521 | scsi_req->cdb[8] = count & 0xff; | ||
522 | scsi_req->cdb[9] = 0; | ||
523 | } | ||
524 | |||
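skd_prep_rw_cdb() above emits a standard 10-byte READ(10)/WRITE(10) CDB: opcode 0x28 or 0x2a, a big-endian 32-bit LBA in bytes 2-5, and a big-endian 16-bit sector count in bytes 7-8. The following standalone sketch (plain userspace C, duplicating the shifts above purely for illustration) shows the resulting byte layout for a sample request:

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>

	/* Pack a READ(10)/WRITE(10) CDB the same way skd_prep_rw_cdb() does. */
	static void prep_rw_cdb(uint8_t cdb[10], int is_read, uint32_t lba, uint16_t count)
	{
		memset(cdb, 0, 10);
		cdb[0] = is_read ? 0x28 : 0x2a;	/* READ(10) : WRITE(10) */
		cdb[2] = (lba >> 24) & 0xff;	/* LBA, big-endian */
		cdb[3] = (lba >> 16) & 0xff;
		cdb[4] = (lba >> 8) & 0xff;
		cdb[5] = lba & 0xff;
		cdb[7] = (count >> 8) & 0xff;	/* transfer length, big-endian */
		cdb[8] = count & 0xff;
	}

	int main(void)
	{
		uint8_t cdb[10];
		int i;

		prep_rw_cdb(cdb, 1, 0x12345678, 16);	/* read 16 sectors at LBA 0x12345678 */
		for (i = 0; i < 10; i++)
			printf("%02x ", cdb[i]);
		printf("\n");	/* prints: 28 00 12 34 56 78 00 00 10 00 */
		return 0;
	}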
525 | static void | ||
526 | skd_prep_zerosize_flush_cdb(struct skd_scsi_request *scsi_req, | ||
527 | struct skd_request_context *skreq) | ||
528 | { | ||
529 | skreq->flush_cmd = 1; | ||
530 | |||
531 | scsi_req->cdb[0] = 0x35; | ||
532 | scsi_req->cdb[1] = 0; | ||
533 | scsi_req->cdb[2] = 0; | ||
534 | scsi_req->cdb[3] = 0; | ||
535 | scsi_req->cdb[4] = 0; | ||
536 | scsi_req->cdb[5] = 0; | ||
537 | scsi_req->cdb[6] = 0; | ||
538 | scsi_req->cdb[7] = 0; | ||
539 | scsi_req->cdb[8] = 0; | ||
540 | scsi_req->cdb[9] = 0; | ||
541 | } | ||
542 | |||
543 | static void | ||
544 | skd_prep_discard_cdb(struct skd_scsi_request *scsi_req, | ||
545 | struct skd_request_context *skreq, | ||
546 | struct page *page, | ||
547 | u32 lba, u32 count) | ||
548 | { | ||
549 | char *buf; | ||
550 | unsigned long len; | ||
551 | struct request *req; | ||
552 | |||
553 | buf = page_address(page); | ||
554 | len = SKD_DISCARD_CDB_LENGTH; | ||
555 | |||
556 | scsi_req->cdb[0] = UNMAP; | ||
557 | scsi_req->cdb[8] = len; | ||
558 | |||
559 | put_unaligned_be16(6 + 16, &buf[0]); | ||
560 | put_unaligned_be16(16, &buf[2]); | ||
561 | put_unaligned_be64(lba, &buf[8]); | ||
562 | put_unaligned_be32(count, &buf[16]); | ||
563 | |||
564 | req = skreq->req; | ||
565 | blk_add_request_payload(req, page, len); | ||
566 | req->buffer = buf; | ||
567 | } | ||
568 | |||
569 | static void skd_request_fn_not_online(struct request_queue *q); | ||
570 | |||
571 | static void skd_request_fn(struct request_queue *q) | ||
572 | { | ||
573 | struct skd_device *skdev = q->queuedata; | ||
574 | struct skd_fitmsg_context *skmsg = NULL; | ||
575 | struct fit_msg_hdr *fmh = NULL; | ||
576 | struct skd_request_context *skreq; | ||
577 | struct request *req = NULL; | ||
578 | struct skd_scsi_request *scsi_req; | ||
579 | struct page *page; | ||
580 | unsigned long io_flags; | ||
581 | int error; | ||
582 | u32 lba; | ||
583 | u32 count; | ||
584 | int data_dir; | ||
585 | u32 be_lba; | ||
586 | u32 be_count; | ||
587 | u64 be_dmaa; | ||
588 | u64 cmdctxt; | ||
589 | u32 timo_slot; | ||
590 | void *cmd_ptr; | ||
591 | int flush, fua; | ||
592 | |||
593 | if (skdev->state != SKD_DRVR_STATE_ONLINE) { | ||
594 | skd_request_fn_not_online(q); | ||
595 | return; | ||
596 | } | ||
597 | |||
598 | if (blk_queue_stopped(skdev->queue)) { | ||
599 | if (skdev->skmsg_free_list == NULL || | ||
600 | skdev->skreq_free_list == NULL || | ||
601 | skdev->in_flight >= skdev->queue_low_water_mark) | ||
602 | /* There is still some kind of shortage */ | ||
603 | return; | ||
604 | |||
605 | queue_flag_clear(QUEUE_FLAG_STOPPED, skdev->queue); | ||
606 | } | ||
607 | |||
608 | /* | ||
609 | * Stop conditions: | ||
610 | * - There are no more native requests | ||
611 | * - There are already the maximum number of requests in progress | ||
612 | * - There are no more skd_request_context entries | ||
613 | * - There are no more FIT msg buffers | ||
614 | */ | ||
615 | for (;; ) { | ||
616 | |||
617 | flush = fua = 0; | ||
618 | |||
619 | req = blk_peek_request(q); | ||
620 | |||
621 | /* Are there any native requests to start? */ | ||
622 | if (req == NULL) | ||
623 | break; | ||
624 | |||
625 | lba = (u32)blk_rq_pos(req); | ||
626 | count = blk_rq_sectors(req); | ||
627 | data_dir = rq_data_dir(req); | ||
628 | io_flags = req->cmd_flags; | ||
629 | |||
630 | if (io_flags & REQ_FLUSH) | ||
631 | flush++; | ||
632 | |||
633 | if (io_flags & REQ_FUA) | ||
634 | fua++; | ||
635 | |||
636 | pr_debug("%s:%s:%d new req=%p lba=%u(0x%x) " | ||
637 | "count=%u(0x%x) dir=%d\n", | ||
638 | skdev->name, __func__, __LINE__, | ||
639 | req, lba, lba, count, count, data_dir); | ||
640 | |||
641 | /* At this point we know there is a request */ | ||
642 | |||
643 | /* Are too many requests already in progress? */ | ||
644 | if (skdev->in_flight >= skdev->cur_max_queue_depth) { | ||
645 | pr_debug("%s:%s:%d qdepth %d, limit %d\n", | ||
646 | skdev->name, __func__, __LINE__, | ||
647 | skdev->in_flight, skdev->cur_max_queue_depth); | ||
648 | break; | ||
649 | } | ||
650 | |||
651 | /* Is a skd_request_context available? */ | ||
652 | skreq = skdev->skreq_free_list; | ||
653 | if (skreq == NULL) { | ||
654 | pr_debug("%s:%s:%d Out of req=%p\n", | ||
655 | skdev->name, __func__, __LINE__, q); | ||
656 | break; | ||
657 | } | ||
658 | SKD_ASSERT(skreq->state == SKD_REQ_STATE_IDLE); | ||
659 | SKD_ASSERT((skreq->id & SKD_ID_INCR) == 0); | ||
660 | |||
661 | /* Now we check to see if we can get a fit msg */ | ||
662 | if (skmsg == NULL) { | ||
663 | if (skdev->skmsg_free_list == NULL) { | ||
664 | pr_debug("%s:%s:%d Out of msg\n", | ||
665 | skdev->name, __func__, __LINE__); | ||
666 | break; | ||
667 | } | ||
668 | } | ||
669 | |||
670 | skreq->flush_cmd = 0; | ||
671 | skreq->n_sg = 0; | ||
672 | skreq->sg_byte_count = 0; | ||
673 | skreq->discard_page = 0; | ||
674 | |||
675 | /* | ||
676 | * OK to now dequeue request from q. | ||
677 | * | ||
678 | * At this point we are committed to either start or reject | ||
679 | * the native request. Note that skd_request_context is | ||
680 | * available but is still at the head of the free list. | ||
681 | */ | ||
682 | blk_start_request(req); | ||
683 | skreq->req = req; | ||
684 | skreq->fitmsg_id = 0; | ||
685 | |||
686 | /* Either a FIT msg is in progress or we have to start one. */ | ||
687 | if (skmsg == NULL) { | ||
688 | /* Are there any FIT msg buffers available? */ | ||
689 | skmsg = skdev->skmsg_free_list; | ||
690 | if (skmsg == NULL) { | ||
691 | pr_debug("%s:%s:%d Out of msg skdev=%p\n", | ||
692 | skdev->name, __func__, __LINE__, | ||
693 | skdev); | ||
694 | break; | ||
695 | } | ||
696 | SKD_ASSERT(skmsg->state == SKD_MSG_STATE_IDLE); | ||
697 | SKD_ASSERT((skmsg->id & SKD_ID_INCR) == 0); | ||
698 | |||
699 | skdev->skmsg_free_list = skmsg->next; | ||
700 | |||
701 | skmsg->state = SKD_MSG_STATE_BUSY; | ||
702 | skmsg->id += SKD_ID_INCR; | ||
703 | |||
704 | /* Initialize the FIT msg header */ | ||
705 | fmh = (struct fit_msg_hdr *)skmsg->msg_buf; | ||
706 | memset(fmh, 0, sizeof(*fmh)); | ||
707 | fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT; | ||
708 | skmsg->length = sizeof(*fmh); | ||
709 | } | ||
710 | |||
711 | skreq->fitmsg_id = skmsg->id; | ||
712 | |||
713 | /* | ||
714 | * Note that a FIT msg may have just been started | ||
715 | * but contains no SoFIT requests yet. | ||
716 | */ | ||
717 | |||
718 | /* | ||
719 | * Transcode the request, checking as we go. The outcome of | ||
720 | * the transcoding is represented by the error variable. | ||
721 | */ | ||
722 | cmd_ptr = &skmsg->msg_buf[skmsg->length]; | ||
723 | memset(cmd_ptr, 0, 32); | ||
724 | |||
725 | be_lba = cpu_to_be32(lba); | ||
726 | be_count = cpu_to_be32(count); | ||
727 | be_dmaa = cpu_to_be64((u64)skreq->sksg_dma_address); | ||
728 | cmdctxt = skreq->id + SKD_ID_INCR; | ||
729 | |||
730 | scsi_req = cmd_ptr; | ||
731 | scsi_req->hdr.tag = cmdctxt; | ||
732 | scsi_req->hdr.sg_list_dma_address = be_dmaa; | ||
733 | |||
734 | if (data_dir == READ) | ||
735 | skreq->sg_data_dir = SKD_DATA_DIR_CARD_TO_HOST; | ||
736 | else | ||
737 | skreq->sg_data_dir = SKD_DATA_DIR_HOST_TO_CARD; | ||
738 | |||
739 | if (io_flags & REQ_DISCARD) { | ||
740 | page = alloc_page(GFP_ATOMIC | __GFP_ZERO); | ||
741 | if (!page) { | ||
742 | pr_err("request_fn:Page allocation failed.\n"); | ||
743 | skd_end_request(skdev, skreq, -ENOMEM); | ||
744 | break; | ||
745 | } | ||
746 | skreq->discard_page = 1; | ||
747 | skd_prep_discard_cdb(scsi_req, skreq, page, lba, count); | ||
748 | |||
749 | } else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) { | ||
750 | skd_prep_zerosize_flush_cdb(scsi_req, skreq); | ||
751 | SKD_ASSERT(skreq->flush_cmd == 1); | ||
752 | |||
753 | } else { | ||
754 | skd_prep_rw_cdb(scsi_req, data_dir, lba, count); | ||
755 | } | ||
756 | |||
757 | if (fua) | ||
758 | scsi_req->cdb[1] |= SKD_FUA_NV; | ||
759 | |||
760 | if (!req->bio) | ||
761 | goto skip_sg; | ||
762 | |||
763 | error = skd_preop_sg_list(skdev, skreq); | ||
764 | |||
765 | if (error != 0) { | ||
766 | /* | ||
767 | * Complete the native request with error. | ||
768 | * Note that the request context is still at the | ||
769 | * head of the free list, and that the SoFIT request | ||
770 | * was encoded into the FIT msg buffer but the FIT | ||
771 | * msg length has not been updated. In short, the | ||
772 | * only resource that has been allocated but might | ||
773 | * not be used is that the FIT msg could be empty. | ||
774 | */ | ||
775 | pr_debug("%s:%s:%d error Out\n", | ||
776 | skdev->name, __func__, __LINE__); | ||
777 | skd_end_request(skdev, skreq, error); | ||
778 | continue; | ||
779 | } | ||
780 | |||
781 | skip_sg: | ||
782 | scsi_req->hdr.sg_list_len_bytes = | ||
783 | cpu_to_be32(skreq->sg_byte_count); | ||
784 | |||
785 | /* Complete resource allocations. */ | ||
786 | skdev->skreq_free_list = skreq->next; | ||
787 | skreq->state = SKD_REQ_STATE_BUSY; | ||
788 | skreq->id += SKD_ID_INCR; | ||
789 | |||
790 | skmsg->length += sizeof(struct skd_scsi_request); | ||
791 | fmh->num_protocol_cmds_coalesced++; | ||
792 | |||
793 | /* | ||
794 | * Update the active request counts. | ||
795 | * Capture the timeout timestamp. | ||
796 | */ | ||
797 | skreq->timeout_stamp = skdev->timeout_stamp; | ||
798 | timo_slot = skreq->timeout_stamp & SKD_TIMEOUT_SLOT_MASK; | ||
799 | skdev->timeout_slot[timo_slot]++; | ||
800 | skdev->in_flight++; | ||
801 | pr_debug("%s:%s:%d req=0x%x busy=%d\n", | ||
802 | skdev->name, __func__, __LINE__, | ||
803 | skreq->id, skdev->in_flight); | ||
804 | |||
805 | /* | ||
806 | * If the FIT msg buffer is full send it. | ||
807 | */ | ||
808 | if (skmsg->length >= SKD_N_FITMSG_BYTES || | ||
809 | fmh->num_protocol_cmds_coalesced >= skd_max_req_per_msg) { | ||
810 | skd_send_fitmsg(skdev, skmsg); | ||
811 | skmsg = NULL; | ||
812 | fmh = NULL; | ||
813 | } | ||
814 | } | ||
815 | |||
816 | /* | ||
817 | * Is a FIT msg in progress? If it is empty put the buffer back | ||
818 | * on the free list. If it is non-empty send what we got. | ||
819 | * This minimizes latency when there are fewer requests than | ||
820 | * what fits in a FIT msg. | ||
821 | */ | ||
822 | if (skmsg != NULL) { | ||
823 | /* Bigger than just a FIT msg header? */ | ||
824 | if (skmsg->length > sizeof(struct fit_msg_hdr)) { | ||
825 | pr_debug("%s:%s:%d sending msg=%p, len %d\n", | ||
826 | skdev->name, __func__, __LINE__, | ||
827 | skmsg, skmsg->length); | ||
828 | skd_send_fitmsg(skdev, skmsg); | ||
829 | } else { | ||
830 | /* | ||
831 | * The FIT msg is empty. It means we got started | ||
832 | * on the msg, but the requests were rejected. | ||
833 | */ | ||
834 | skmsg->state = SKD_MSG_STATE_IDLE; | ||
835 | skmsg->id += SKD_ID_INCR; | ||
836 | skmsg->next = skdev->skmsg_free_list; | ||
837 | skdev->skmsg_free_list = skmsg; | ||
838 | } | ||
839 | skmsg = NULL; | ||
840 | fmh = NULL; | ||
841 | } | ||
842 | |||
843 | /* | ||
844 | * If req is non-NULL it means there is something to do but | ||
845 | * we are out of a resource. | ||
846 | */ | ||
847 | if (req) | ||
848 | blk_stop_queue(skdev->queue); | ||
849 | } | ||
850 | |||
851 | static void skd_end_request(struct skd_device *skdev, | ||
852 | struct skd_request_context *skreq, int error) | ||
853 | { | ||
854 | struct request *req = skreq->req; | ||
855 | unsigned int io_flags = req->cmd_flags; | ||
856 | |||
857 | if ((io_flags & REQ_DISCARD) && | ||
858 | (skreq->discard_page == 1)) { | ||
859 | pr_debug("%s:%s:%d, free the page!", | ||
860 | skdev->name, __func__, __LINE__); | ||
861 | free_page((unsigned long)req->buffer); | ||
862 | req->buffer = NULL; | ||
863 | } | ||
864 | |||
865 | if (unlikely(error)) { | ||
866 | struct request *req = skreq->req; | ||
867 | char *cmd = (rq_data_dir(req) == READ) ? "read" : "write"; | ||
868 | u32 lba = (u32)blk_rq_pos(req); | ||
869 | u32 count = blk_rq_sectors(req); | ||
870 | |||
871 | pr_err("(%s): Error cmd=%s sect=%u count=%u id=0x%x\n", | ||
872 | skd_name(skdev), cmd, lba, count, skreq->id); | ||
873 | } else | ||
874 | pr_debug("%s:%s:%d id=0x%x error=%d\n", | ||
875 | skdev->name, __func__, __LINE__, skreq->id, error); | ||
876 | |||
877 | __blk_end_request_all(skreq->req, error); | ||
878 | } | ||
879 | |||
880 | static int skd_preop_sg_list(struct skd_device *skdev, | ||
881 | struct skd_request_context *skreq) | ||
882 | { | ||
883 | struct request *req = skreq->req; | ||
884 | int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD; | ||
885 | int pci_dir = writing ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE; | ||
886 | struct scatterlist *sg = &skreq->sg[0]; | ||
887 | int n_sg; | ||
888 | int i; | ||
889 | |||
890 | skreq->sg_byte_count = 0; | ||
891 | |||
892 | /* SKD_ASSERT(skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD || | ||
893 | skreq->sg_data_dir == SKD_DATA_DIR_CARD_TO_HOST); */ | ||
894 | |||
895 | n_sg = blk_rq_map_sg(skdev->queue, req, sg); | ||
896 | if (n_sg <= 0) | ||
897 | return -EINVAL; | ||
898 | |||
899 | /* | ||
900 | * Map scatterlist to PCI bus addresses. | ||
901 | * Note PCI might change the number of entries. | ||
902 | */ | ||
903 | n_sg = pci_map_sg(skdev->pdev, sg, n_sg, pci_dir); | ||
904 | if (n_sg <= 0) | ||
905 | return -EINVAL; | ||
906 | |||
907 | SKD_ASSERT(n_sg <= skdev->sgs_per_request); | ||
908 | |||
909 | skreq->n_sg = n_sg; | ||
910 | |||
911 | for (i = 0; i < n_sg; i++) { | ||
912 | struct fit_sg_descriptor *sgd = &skreq->sksg_list[i]; | ||
913 | u32 cnt = sg_dma_len(&sg[i]); | ||
914 | uint64_t dma_addr = sg_dma_address(&sg[i]); | ||
915 | |||
916 | sgd->control = FIT_SGD_CONTROL_NOT_LAST; | ||
917 | sgd->byte_count = cnt; | ||
918 | skreq->sg_byte_count += cnt; | ||
919 | sgd->host_side_addr = dma_addr; | ||
920 | sgd->dev_side_addr = 0; | ||
921 | } | ||
922 | |||
923 | skreq->sksg_list[n_sg - 1].next_desc_ptr = 0LL; | ||
924 | skreq->sksg_list[n_sg - 1].control = FIT_SGD_CONTROL_LAST; | ||
925 | |||
926 | if (unlikely(skdev->dbg_level > 1)) { | ||
927 | pr_debug("%s:%s:%d skreq=%x sksg_list=%p sksg_dma=%llx\n", | ||
928 | skdev->name, __func__, __LINE__, | ||
929 | skreq->id, skreq->sksg_list, skreq->sksg_dma_address); | ||
930 | for (i = 0; i < n_sg; i++) { | ||
931 | struct fit_sg_descriptor *sgd = &skreq->sksg_list[i]; | ||
932 | pr_debug("%s:%s:%d sg[%d] count=%u ctrl=0x%x " | ||
933 | "addr=0x%llx next=0x%llx\n", | ||
934 | skdev->name, __func__, __LINE__, | ||
935 | i, sgd->byte_count, sgd->control, | ||
936 | sgd->host_side_addr, sgd->next_desc_ptr); | ||
937 | } | ||
938 | } | ||
939 | |||
940 | return 0; | ||
941 | } | ||
942 | |||
943 | static void skd_postop_sg_list(struct skd_device *skdev, | ||
944 | struct skd_request_context *skreq) | ||
945 | { | ||
946 | int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD; | ||
947 | int pci_dir = writing ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE; | ||
948 | |||
949 | /* | ||
950 | * restore the next ptr for next IO request so we | ||
951 | * don't have to set it every time. | ||
952 | */ | ||
953 | skreq->sksg_list[skreq->n_sg - 1].next_desc_ptr = | ||
954 | skreq->sksg_dma_address + | ||
955 | ((skreq->n_sg) * sizeof(struct fit_sg_descriptor)); | ||
956 | pci_unmap_sg(skdev->pdev, &skreq->sg[0], skreq->n_sg, pci_dir); | ||
957 | } | ||
958 | |||
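The pre/post-op pair above keeps a fixed array of FIT SG descriptors per request: each mapped segment becomes one descriptor marked NOT_LAST and chained to the next, the final one is marked LAST with its next pointer cleared, and skd_postop_sg_list() restores that next pointer so the chain can be reused without re-linking. A minimal userspace model of that chain handling (names and constants are stand-ins, not the driver's):

	#include <stdio.h>
	#include <stdint.h>

	#define CTRL_NOT_LAST 0	/* stand-ins for FIT_SGD_CONTROL_* */
	#define CTRL_LAST     1

	struct sg_desc {
		uint32_t byte_count;
		uint32_t control;
		uint64_t next_desc_ptr;	/* "bus address" of the next descriptor */
	};

	/* Fill n descriptors from segment lengths and terminate the chain. */
	static uint32_t fill_chain(struct sg_desc *list, uint64_t list_dma,
				   const uint32_t *len, int n)
	{
		uint32_t total = 0;
		int i;

		for (i = 0; i < n; i++) {
			list[i].byte_count = len[i];
			list[i].control = CTRL_NOT_LAST;
			list[i].next_desc_ptr = list_dma + (i + 1) * sizeof(*list);
			total += len[i];
		}
		list[n - 1].control = CTRL_LAST;
		list[n - 1].next_desc_ptr = 0;
		return total;
	}

	/* After completion, re-link the last entry, as skd_postop_sg_list() does. */
	static void restore_chain(struct sg_desc *list, uint64_t list_dma, int n)
	{
		list[n - 1].next_desc_ptr = list_dma + n * sizeof(*list);
	}

	int main(void)
	{
		struct sg_desc list[3];
		const uint32_t len[3] = { 4096, 4096, 512 };
		uint32_t total = fill_chain(list, 0x1000, len, 3);

		printf("sg_byte_count=%u last.next=%#llx\n", total,
		       (unsigned long long)list[2].next_desc_ptr);
		restore_chain(list, 0x1000, 3);
		printf("restored last.next=%#llx\n",
		       (unsigned long long)list[2].next_desc_ptr);
		return 0;
	}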
959 | static void skd_request_fn_not_online(struct request_queue *q) | ||
960 | { | ||
961 | struct skd_device *skdev = q->queuedata; | ||
962 | int error; | ||
963 | |||
964 | SKD_ASSERT(skdev->state != SKD_DRVR_STATE_ONLINE); | ||
965 | |||
966 | skd_log_skdev(skdev, "req_not_online"); | ||
967 | switch (skdev->state) { | ||
968 | case SKD_DRVR_STATE_PAUSING: | ||
969 | case SKD_DRVR_STATE_PAUSED: | ||
970 | case SKD_DRVR_STATE_STARTING: | ||
971 | case SKD_DRVR_STATE_RESTARTING: | ||
972 | case SKD_DRVR_STATE_WAIT_BOOT: | ||
973 | /* In case of starting, we haven't started the queue, | ||
974 | * so we can't get here... but requests are | ||
975 | * possibly hanging out waiting for us because we | ||
976 | * reported the dev/skd0 already. They'll wait | ||
977 | * forever if connect doesn't complete. | ||
978 | * What to do??? delay dev/skd0 ?? | ||
979 | */ | ||
980 | case SKD_DRVR_STATE_BUSY: | ||
981 | case SKD_DRVR_STATE_BUSY_IMMINENT: | ||
982 | case SKD_DRVR_STATE_BUSY_ERASE: | ||
983 | case SKD_DRVR_STATE_DRAINING_TIMEOUT: | ||
984 | return; | ||
985 | |||
986 | case SKD_DRVR_STATE_BUSY_SANITIZE: | ||
987 | case SKD_DRVR_STATE_STOPPING: | ||
988 | case SKD_DRVR_STATE_SYNCING: | ||
989 | case SKD_DRVR_STATE_FAULT: | ||
990 | case SKD_DRVR_STATE_DISAPPEARED: | ||
991 | default: | ||
992 | error = -EIO; | ||
993 | break; | ||
994 | } | ||
995 | |||
996 | /* If we get here, terminate all pending block requests | ||
997 | * with EIO and any scsi pass thru with appropriate sense | ||
998 | */ | ||
999 | |||
1000 | skd_fail_all_pending(skdev); | ||
1001 | } | ||
1002 | |||
1003 | /* | ||
1004 | ***************************************************************************** | ||
1005 | * TIMER | ||
1006 | ***************************************************************************** | ||
1007 | */ | ||
1008 | |||
1009 | static void skd_timer_tick_not_online(struct skd_device *skdev); | ||
1010 | |||
1011 | static void skd_timer_tick(ulong arg) | ||
1012 | { | ||
1013 | struct skd_device *skdev = (struct skd_device *)arg; | ||
1014 | |||
1015 | u32 timo_slot; | ||
1016 | u32 overdue_timestamp; | ||
1017 | unsigned long reqflags; | ||
1018 | u32 state; | ||
1019 | |||
1020 | if (skdev->state == SKD_DRVR_STATE_FAULT) | ||
1021 | /* The driver has declared fault, and we want it to | ||
1022 | * stay that way until driver is reloaded. | ||
1023 | */ | ||
1024 | return; | ||
1025 | |||
1026 | spin_lock_irqsave(&skdev->lock, reqflags); | ||
1027 | |||
1028 | state = SKD_READL(skdev, FIT_STATUS); | ||
1029 | state &= FIT_SR_DRIVE_STATE_MASK; | ||
1030 | if (state != skdev->drive_state) | ||
1031 | skd_isr_fwstate(skdev); | ||
1032 | |||
1033 | if (skdev->state != SKD_DRVR_STATE_ONLINE) { | ||
1034 | skd_timer_tick_not_online(skdev); | ||
1035 | goto timer_func_out; | ||
1036 | } | ||
1037 | skdev->timeout_stamp++; | ||
1038 | timo_slot = skdev->timeout_stamp & SKD_TIMEOUT_SLOT_MASK; | ||
1039 | |||
1040 | /* | ||
1041 | * All requests that happened during the previous use of | ||
1042 | * this slot should be done by now. The previous use was | ||
1043 | * about 4 seconds (SKD_N_TIMEOUT_SLOT ticks) ago. | ||
1044 | */ | ||
1045 | if (skdev->timeout_slot[timo_slot] == 0) | ||
1046 | goto timer_func_out; | ||
1047 | |||
1048 | /* Something is overdue */ | ||
1049 | overdue_timestamp = skdev->timeout_stamp - SKD_N_TIMEOUT_SLOT; | ||
1050 | |||
1051 | pr_debug("%s:%s:%d found %d timeouts, draining busy=%d\n", | ||
1052 | skdev->name, __func__, __LINE__, | ||
1053 | skdev->timeout_slot[timo_slot], skdev->in_flight); | ||
1054 | pr_err("(%s): Overdue IOs (%d), busy %d\n", | ||
1055 | skd_name(skdev), skdev->timeout_slot[timo_slot], | ||
1056 | skdev->in_flight); | ||
1057 | |||
1058 | skdev->timer_countdown = SKD_DRAINING_TIMO; | ||
1059 | skdev->state = SKD_DRVR_STATE_DRAINING_TIMEOUT; | ||
1060 | skdev->timo_slot = timo_slot; | ||
1061 | blk_stop_queue(skdev->queue); | ||
1062 | |||
1063 | timer_func_out: | ||
1064 | mod_timer(&skdev->timer, (jiffies + HZ)); | ||
1065 | |||
1066 | spin_unlock_irqrestore(&skdev->lock, reqflags); | ||
1067 | } | ||
1068 | |||
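The timer above drives a small four-slot timeout wheel: skd_timer_tick() runs once a second and bumps timeout_stamp, each issued request is counted under slot (timeout_stamp & SKD_TIMEOUT_SLOT_MASK) in skd_request_fn(), and a non-zero count in the slot the stamp is wrapping back onto means those requests have been in flight for at least SKD_N_TIMEOUT_SLOT ticks. A compact userspace model of that bookkeeping (illustrative only, not the driver's code):

	#include <stdio.h>

	#define N_TIMEOUT_SLOT    4u	/* mirrors SKD_N_TIMEOUT_SLOT */
	#define TIMEOUT_SLOT_MASK 3u	/* mirrors SKD_TIMEOUT_SLOT_MASK */

	static unsigned int timeout_stamp;
	static unsigned int timeout_slot[N_TIMEOUT_SLOT];

	/* Issue path: tag the request with the current stamp and bump its slot. */
	static unsigned int issue_request(void)
	{
		unsigned int stamp = timeout_stamp;

		timeout_slot[stamp & TIMEOUT_SLOT_MASK]++;
		return stamp;
	}

	/* Completion path: release the slot the request was issued under. */
	static void complete_request(unsigned int stamp)
	{
		timeout_slot[stamp & TIMEOUT_SLOT_MASK]--;
	}

	/* Once-per-second tick: anything still counted in the slot we are about
	 * to reuse has been in flight for at least N_TIMEOUT_SLOT seconds. */
	static void timer_tick(void)
	{
		unsigned int slot;

		timeout_stamp++;
		slot = timeout_stamp & TIMEOUT_SLOT_MASK;
		if (timeout_slot[slot])
			printf("tick %u: %u overdue request(s) in slot %u\n",
			       timeout_stamp, timeout_slot[slot], slot);
	}

	int main(void)
	{
		unsigned int stamp = issue_request();
		int i;

		for (i = 0; i < 5; i++)
			timer_tick();		/* reports the stuck request on tick 4 */
		complete_request(stamp);
		return 0;
	}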
1069 | static void skd_timer_tick_not_online(struct skd_device *skdev) | ||
1070 | { | ||
1071 | switch (skdev->state) { | ||
1072 | case SKD_DRVR_STATE_IDLE: | ||
1073 | case SKD_DRVR_STATE_LOAD: | ||
1074 | break; | ||
1075 | case SKD_DRVR_STATE_BUSY_SANITIZE: | ||
1076 | pr_debug("%s:%s:%d drive busy sanitize[%x], driver[%x]\n", | ||
1077 | skdev->name, __func__, __LINE__, | ||
1078 | skdev->drive_state, skdev->state); | ||
1079 | /* If we've been in sanitize for 3 seconds, we figure we're not | ||
1080 | * going to get any more completions, so recover requests now | ||
1081 | */ | ||
1082 | if (skdev->timer_countdown > 0) { | ||
1083 | skdev->timer_countdown--; | ||
1084 | return; | ||
1085 | } | ||
1086 | skd_recover_requests(skdev, 0); | ||
1087 | break; | ||
1088 | |||
1089 | case SKD_DRVR_STATE_BUSY: | ||
1090 | case SKD_DRVR_STATE_BUSY_IMMINENT: | ||
1091 | case SKD_DRVR_STATE_BUSY_ERASE: | ||
1092 | pr_debug("%s:%s:%d busy[%x], countdown=%d\n", | ||
1093 | skdev->name, __func__, __LINE__, | ||
1094 | skdev->state, skdev->timer_countdown); | ||
1095 | if (skdev->timer_countdown > 0) { | ||
1096 | skdev->timer_countdown--; | ||
1097 | return; | ||
1098 | } | ||
1099 | pr_debug("%s:%s:%d busy[%x], timedout=%d, restarting device.", | ||
1100 | skdev->name, __func__, __LINE__, | ||
1101 | skdev->state, skdev->timer_countdown); | ||
1102 | skd_restart_device(skdev); | ||
1103 | break; | ||
1104 | |||
1105 | case SKD_DRVR_STATE_WAIT_BOOT: | ||
1106 | case SKD_DRVR_STATE_STARTING: | ||
1107 | if (skdev->timer_countdown > 0) { | ||
1108 | skdev->timer_countdown--; | ||
1109 | return; | ||
1110 | } | ||
1111 | /* For now, we fault the drive. Could attempt resets to | ||
1112 | * recover at some point. */ | ||
1113 | skdev->state = SKD_DRVR_STATE_FAULT; | ||
1114 | |||
1115 | pr_err("(%s): DriveFault Connect Timeout (%x)\n", | ||
1116 | skd_name(skdev), skdev->drive_state); | ||
1117 | |||
1118 | /*start the queue so we can respond with error to requests */ | ||
1119 | /* wakeup anyone waiting for startup complete */ | ||
1120 | blk_start_queue(skdev->queue); | ||
1121 | skdev->gendisk_on = -1; | ||
1122 | wake_up_interruptible(&skdev->waitq); | ||
1123 | break; | ||
1124 | |||
1125 | case SKD_DRVR_STATE_ONLINE: | ||
1126 | /* shouldn't get here. */ | ||
1127 | break; | ||
1128 | |||
1129 | case SKD_DRVR_STATE_PAUSING: | ||
1130 | case SKD_DRVR_STATE_PAUSED: | ||
1131 | break; | ||
1132 | |||
1133 | case SKD_DRVR_STATE_DRAINING_TIMEOUT: | ||
1134 | pr_debug("%s:%s:%d " | ||
1135 | "draining busy [%d] tick[%d] qdb[%d] tmls[%d]\n", | ||
1136 | skdev->name, __func__, __LINE__, | ||
1137 | skdev->timo_slot, | ||
1138 | skdev->timer_countdown, | ||
1139 | skdev->in_flight, | ||
1140 | skdev->timeout_slot[skdev->timo_slot]); | ||
1141 | /* if the slot has cleared we can let the I/O continue */ | ||
1142 | if (skdev->timeout_slot[skdev->timo_slot] == 0) { | ||
1143 | pr_debug("%s:%s:%d Slot drained, starting queue.\n", | ||
1144 | skdev->name, __func__, __LINE__); | ||
1145 | skdev->state = SKD_DRVR_STATE_ONLINE; | ||
1146 | blk_start_queue(skdev->queue); | ||
1147 | return; | ||
1148 | } | ||
1149 | if (skdev->timer_countdown > 0) { | ||
1150 | skdev->timer_countdown--; | ||
1151 | return; | ||
1152 | } | ||
1153 | skd_restart_device(skdev); | ||
1154 | break; | ||
1155 | |||
1156 | case SKD_DRVR_STATE_RESTARTING: | ||
1157 | if (skdev->timer_countdown > 0) { | ||
1158 | skdev->timer_countdown--; | ||
1159 | return; | ||
1160 | } | ||
1161 | /* For now, we fault the drive. Could attempt resets to | ||
1162 | * recover at some point. */ | ||
1163 | skdev->state = SKD_DRVR_STATE_FAULT; | ||
1164 | pr_err("(%s): DriveFault Reconnect Timeout (%x)\n", | ||
1165 | skd_name(skdev), skdev->drive_state); | ||
1166 | |||
1167 | /* | ||
1168 | * Recovering does two things: | ||
1169 | * 1. completes IO with error | ||
1170 | * 2. reclaims dma resources | ||
1171 | * When is it safe to recover requests? | ||
1172 | * - if the drive state is faulted | ||
1173 | * - if the state is still soft reset after our timeout | ||
1174 | * - if the drive registers are dead (state = FF) | ||
1175 | * If it is "unsafe", we still need to recover, so we will | ||
1176 | * disable pci bus mastering and disable our interrupts. | ||
1177 | */ | ||
1178 | |||
1179 | if ((skdev->drive_state == FIT_SR_DRIVE_SOFT_RESET) || | ||
1180 | (skdev->drive_state == FIT_SR_DRIVE_FAULT) || | ||
1181 | (skdev->drive_state == FIT_SR_DRIVE_STATE_MASK)) | ||
1182 | /* It never came out of soft reset. Try to | ||
1183 | * recover the requests and then let them | ||
1184 | * fail. This is to mitigate hung processes. */ | ||
1185 | skd_recover_requests(skdev, 0); | ||
1186 | else { | ||
1187 | pr_err("(%s): Disable BusMaster (%x)\n", | ||
1188 | skd_name(skdev), skdev->drive_state); | ||
1189 | pci_disable_device(skdev->pdev); | ||
1190 | skd_disable_interrupts(skdev); | ||
1191 | skd_recover_requests(skdev, 0); | ||
1192 | } | ||
1193 | |||
1194 | /* start the queue so we can respond with error to requests */ | ||
1195 | /* wake up anyone waiting for startup complete */ | ||
1196 | blk_start_queue(skdev->queue); | ||
1197 | skdev->gendisk_on = -1; | ||
1198 | wake_up_interruptible(&skdev->waitq); | ||
1199 | break; | ||
1200 | |||
1201 | case SKD_DRVR_STATE_RESUMING: | ||
1202 | case SKD_DRVR_STATE_STOPPING: | ||
1203 | case SKD_DRVR_STATE_SYNCING: | ||
1204 | case SKD_DRVR_STATE_FAULT: | ||
1205 | case SKD_DRVR_STATE_DISAPPEARED: | ||
1206 | default: | ||
1207 | break; | ||
1208 | } | ||
1209 | } | ||
1210 | |||
1211 | static int skd_start_timer(struct skd_device *skdev) | ||
1212 | { | ||
1213 | int rc; | ||
1214 | |||
1215 | init_timer(&skdev->timer); | ||
1216 | setup_timer(&skdev->timer, skd_timer_tick, (ulong)skdev); | ||
1217 | |||
1218 | rc = mod_timer(&skdev->timer, (jiffies + HZ)); | ||
1219 | if (rc) | ||
1220 | pr_err("%s: failed to start timer %d\n", | ||
1221 | __func__, rc); | ||
1222 | return rc; | ||
1223 | } | ||
1224 | |||
1225 | static void skd_kill_timer(struct skd_device *skdev) | ||
1226 | { | ||
1227 | del_timer_sync(&skdev->timer); | ||
1228 | } | ||
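The tick handler above only acts on a state once its countdown reaches zero; every other tick just decrements the counter. A minimal, self-contained sketch of that decrement-then-escalate pattern (all names are illustrative, not driver symbols):

#include <stdio.h>

/* Illustrative countdown tick: mirrors the "decrement, act at zero"
 * pattern used by the timer states above. All names are made up. */
enum drv_state { ST_BUSY, ST_FAULT };

struct dev {
	enum drv_state state;
	int countdown;		/* ticks remaining before we give up */
};

static void tick(struct dev *d)
{
	if (d->state != ST_BUSY)
		return;
	if (d->countdown > 0) {
		d->countdown--;	/* still willing to wait */
		return;
	}
	d->state = ST_FAULT;	/* timed out: escalate */
}

int main(void)
{
	struct dev d = { .state = ST_BUSY, .countdown = 3 };

	for (int i = 0; i < 5; i++) {
		tick(&d);
		printf("tick %d: state=%d countdown=%d\n", i, d.state, d.countdown);
	}
	return 0;
}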
1229 | |||
1230 | /* | ||
1231 | ***************************************************************************** | ||
1232 | * IOCTL | ||
1233 | ***************************************************************************** | ||
1234 | */ | ||
1235 | static int skd_ioctl_sg_io(struct skd_device *skdev, | ||
1236 | fmode_t mode, void __user *argp); | ||
1237 | static int skd_sg_io_get_and_check_args(struct skd_device *skdev, | ||
1238 | struct skd_sg_io *sksgio); | ||
1239 | static int skd_sg_io_obtain_skspcl(struct skd_device *skdev, | ||
1240 | struct skd_sg_io *sksgio); | ||
1241 | static int skd_sg_io_prep_buffering(struct skd_device *skdev, | ||
1242 | struct skd_sg_io *sksgio); | ||
1243 | static int skd_sg_io_copy_buffer(struct skd_device *skdev, | ||
1244 | struct skd_sg_io *sksgio, int dxfer_dir); | ||
1245 | static int skd_sg_io_send_fitmsg(struct skd_device *skdev, | ||
1246 | struct skd_sg_io *sksgio); | ||
1247 | static int skd_sg_io_await(struct skd_device *skdev, struct skd_sg_io *sksgio); | ||
1248 | static int skd_sg_io_release_skspcl(struct skd_device *skdev, | ||
1249 | struct skd_sg_io *sksgio); | ||
1250 | static int skd_sg_io_put_status(struct skd_device *skdev, | ||
1251 | struct skd_sg_io *sksgio); | ||
1252 | |||
1253 | static void skd_complete_special(struct skd_device *skdev, | ||
1254 | volatile struct fit_completion_entry_v1 | ||
1255 | *skcomp, | ||
1256 | volatile struct fit_comp_error_info *skerr, | ||
1257 | struct skd_special_context *skspcl); | ||
1258 | |||
1259 | static int skd_bdev_ioctl(struct block_device *bdev, fmode_t mode, | ||
1260 | uint cmd_in, ulong arg) | ||
1261 | { | ||
1262 | int rc = 0; | ||
1263 | struct gendisk *disk = bdev->bd_disk; | ||
1264 | struct skd_device *skdev = disk->private_data; | ||
1265 | void __user *p = (void __user *)arg; | ||
1266 | |||
1267 | pr_debug("%s:%s:%d %s: CMD[%s] ioctl mode 0x%x, cmd 0x%x arg %0lx\n", | ||
1268 | skdev->name, __func__, __LINE__, | ||
1269 | disk->disk_name, current->comm, mode, cmd_in, arg); | ||
1270 | |||
1271 | if (!capable(CAP_SYS_ADMIN)) | ||
1272 | return -EPERM; | ||
1273 | |||
1274 | switch (cmd_in) { | ||
1275 | case SG_SET_TIMEOUT: | ||
1276 | case SG_GET_TIMEOUT: | ||
1277 | case SG_GET_VERSION_NUM: | ||
1278 | rc = scsi_cmd_ioctl(disk->queue, disk, mode, cmd_in, p); | ||
1279 | break; | ||
1280 | case SG_IO: | ||
1281 | rc = skd_ioctl_sg_io(skdev, mode, p); | ||
1282 | break; | ||
1283 | |||
1284 | default: | ||
1285 | rc = -ENOTTY; | ||
1286 | break; | ||
1287 | } | ||
1288 | |||
1289 | pr_debug("%s:%s:%d %s: completion rc %d\n", | ||
1290 | skdev->name, __func__, __LINE__, disk->disk_name, rc); | ||
1291 | return rc; | ||
1292 | } | ||
1293 | |||
1294 | static int skd_ioctl_sg_io(struct skd_device *skdev, fmode_t mode, | ||
1295 | void __user *argp) | ||
1296 | { | ||
1297 | int rc; | ||
1298 | struct skd_sg_io sksgio; | ||
1299 | |||
1300 | memset(&sksgio, 0, sizeof(sksgio)); | ||
1301 | sksgio.mode = mode; | ||
1302 | sksgio.argp = argp; | ||
1303 | sksgio.iov = &sksgio.no_iov_iov; | ||
1304 | |||
1305 | switch (skdev->state) { | ||
1306 | case SKD_DRVR_STATE_ONLINE: | ||
1307 | case SKD_DRVR_STATE_BUSY_IMMINENT: | ||
1308 | break; | ||
1309 | |||
1310 | default: | ||
1311 | pr_debug("%s:%s:%d drive not online\n", | ||
1312 | skdev->name, __func__, __LINE__); | ||
1313 | rc = -ENXIO; | ||
1314 | goto out; | ||
1315 | } | ||
1316 | |||
1317 | rc = skd_sg_io_get_and_check_args(skdev, &sksgio); | ||
1318 | if (rc) | ||
1319 | goto out; | ||
1320 | |||
1321 | rc = skd_sg_io_obtain_skspcl(skdev, &sksgio); | ||
1322 | if (rc) | ||
1323 | goto out; | ||
1324 | |||
1325 | rc = skd_sg_io_prep_buffering(skdev, &sksgio); | ||
1326 | if (rc) | ||
1327 | goto out; | ||
1328 | |||
1329 | rc = skd_sg_io_copy_buffer(skdev, &sksgio, SG_DXFER_TO_DEV); | ||
1330 | if (rc) | ||
1331 | goto out; | ||
1332 | |||
1333 | rc = skd_sg_io_send_fitmsg(skdev, &sksgio); | ||
1334 | if (rc) | ||
1335 | goto out; | ||
1336 | |||
1337 | rc = skd_sg_io_await(skdev, &sksgio); | ||
1338 | if (rc) | ||
1339 | goto out; | ||
1340 | |||
1341 | rc = skd_sg_io_copy_buffer(skdev, &sksgio, SG_DXFER_FROM_DEV); | ||
1342 | if (rc) | ||
1343 | goto out; | ||
1344 | |||
1345 | rc = skd_sg_io_put_status(skdev, &sksgio); | ||
1346 | if (rc) | ||
1347 | goto out; | ||
1348 | |||
1349 | rc = 0; | ||
1350 | |||
1351 | out: | ||
1352 | skd_sg_io_release_skspcl(skdev, &sksgio); | ||
1353 | |||
1354 | if (sksgio.iov != NULL && sksgio.iov != &sksgio.no_iov_iov) | ||
1355 | kfree(sksgio.iov); | ||
1356 | return rc; | ||
1357 | } | ||
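Because skd_bdev_ioctl() forwards SG_IO requests with a caller-built sg_io_hdr, an administrator can drive the device with ordinary SCSI CDBs from userspace. A hedged sketch, assuming the block node is named /dev/skd0 (the node name is an assumption) and the caller has CAP_SYS_ADMIN; it issues a plain 6-byte INQUIRY:

#include <fcntl.h>
#include <scsi/sg.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	unsigned char cdb[6] = { 0x12, 0, 0, 0, 96, 0 };	/* INQUIRY, 96 bytes */
	unsigned char data[96], sense[32];
	struct sg_io_hdr hdr;
	int fd = open("/dev/skd0", O_RDWR);	/* device name is an assumption */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&hdr, 0, sizeof(hdr));
	hdr.interface_id = 'S';			/* SG_INTERFACE_ID_ORIG */
	hdr.dxfer_direction = SG_DXFER_FROM_DEV;
	hdr.cmd_len = sizeof(cdb);
	hdr.cmdp = cdb;
	hdr.dxfer_len = sizeof(data);
	hdr.dxferp = data;
	hdr.mx_sb_len = sizeof(sense);
	hdr.sbp = sense;
	hdr.timeout = 5000;			/* milliseconds */

	if (ioctl(fd, SG_IO, &hdr) < 0)
		perror("SG_IO");
	else
		printf("status=0x%x resid=%d\n", hdr.status, hdr.resid);

	close(fd);
	return 0;
}

Note that the validation path above caps iovec_count at 256 and dxfer_len at PAGE_SIZE * SKD_N_SG_PER_SPECIAL, so larger transfers have to be split by the caller.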
1358 | |||
1359 | static int skd_sg_io_get_and_check_args(struct skd_device *skdev, | ||
1360 | struct skd_sg_io *sksgio) | ||
1361 | { | ||
1362 | struct sg_io_hdr *sgp = &sksgio->sg; | ||
1363 | int i, acc; | ||
1364 | |||
1365 | if (!access_ok(VERIFY_WRITE, sksgio->argp, sizeof(sg_io_hdr_t))) { | ||
1366 | pr_debug("%s:%s:%d access sg failed %p\n", | ||
1367 | skdev->name, __func__, __LINE__, sksgio->argp); | ||
1368 | return -EFAULT; | ||
1369 | } | ||
1370 | |||
1371 | if (__copy_from_user(sgp, sksgio->argp, sizeof(sg_io_hdr_t))) { | ||
1372 | pr_debug("%s:%s:%d copy_from_user sg failed %p\n", | ||
1373 | skdev->name, __func__, __LINE__, sksgio->argp); | ||
1374 | return -EFAULT; | ||
1375 | } | ||
1376 | |||
1377 | if (sgp->interface_id != SG_INTERFACE_ID_ORIG) { | ||
1378 | pr_debug("%s:%s:%d interface_id invalid 0x%x\n", | ||
1379 | skdev->name, __func__, __LINE__, sgp->interface_id); | ||
1380 | return -EINVAL; | ||
1381 | } | ||
1382 | |||
1383 | if (sgp->cmd_len > sizeof(sksgio->cdb)) { | ||
1384 | pr_debug("%s:%s:%d cmd_len invalid %d\n", | ||
1385 | skdev->name, __func__, __LINE__, sgp->cmd_len); | ||
1386 | return -EINVAL; | ||
1387 | } | ||
1388 | |||
1389 | if (sgp->iovec_count > 256) { | ||
1390 | pr_debug("%s:%s:%d iovec_count invalid %d\n", | ||
1391 | skdev->name, __func__, __LINE__, sgp->iovec_count); | ||
1392 | return -EINVAL; | ||
1393 | } | ||
1394 | |||
1395 | if (sgp->dxfer_len > (PAGE_SIZE * SKD_N_SG_PER_SPECIAL)) { | ||
1396 | pr_debug("%s:%s:%d dxfer_len invalid %d\n", | ||
1397 | skdev->name, __func__, __LINE__, sgp->dxfer_len); | ||
1398 | return -EINVAL; | ||
1399 | } | ||
1400 | |||
1401 | switch (sgp->dxfer_direction) { | ||
1402 | case SG_DXFER_NONE: | ||
1403 | acc = -1; | ||
1404 | break; | ||
1405 | |||
1406 | case SG_DXFER_TO_DEV: | ||
1407 | acc = VERIFY_READ; | ||
1408 | break; | ||
1409 | |||
1410 | case SG_DXFER_FROM_DEV: | ||
1411 | case SG_DXFER_TO_FROM_DEV: | ||
1412 | acc = VERIFY_WRITE; | ||
1413 | break; | ||
1414 | |||
1415 | default: | ||
1416 | pr_debug("%s:%s:%d dxfer_dir invalid %d\n", | ||
1417 | skdev->name, __func__, __LINE__, sgp->dxfer_direction); | ||
1418 | return -EINVAL; | ||
1419 | } | ||
1420 | |||
1421 | if (copy_from_user(sksgio->cdb, sgp->cmdp, sgp->cmd_len)) { | ||
1422 | pr_debug("%s:%s:%d copy_from_user cmdp failed %p\n", | ||
1423 | skdev->name, __func__, __LINE__, sgp->cmdp); | ||
1424 | return -EFAULT; | ||
1425 | } | ||
1426 | |||
1427 | if (sgp->mx_sb_len != 0) { | ||
1428 | if (!access_ok(VERIFY_WRITE, sgp->sbp, sgp->mx_sb_len)) { | ||
1429 | pr_debug("%s:%s:%d access sbp failed %p\n", | ||
1430 | skdev->name, __func__, __LINE__, sgp->sbp); | ||
1431 | return -EFAULT; | ||
1432 | } | ||
1433 | } | ||
1434 | |||
1435 | if (sgp->iovec_count == 0) { | ||
1436 | sksgio->iov[0].iov_base = sgp->dxferp; | ||
1437 | sksgio->iov[0].iov_len = sgp->dxfer_len; | ||
1438 | sksgio->iovcnt = 1; | ||
1439 | sksgio->dxfer_len = sgp->dxfer_len; | ||
1440 | } else { | ||
1441 | struct sg_iovec *iov; | ||
1442 | uint nbytes = sizeof(*iov) * sgp->iovec_count; | ||
1443 | size_t iov_data_len; | ||
1444 | |||
1445 | iov = kmalloc(nbytes, GFP_KERNEL); | ||
1446 | if (iov == NULL) { | ||
1447 | pr_debug("%s:%s:%d alloc iovec failed %d\n", | ||
1448 | skdev->name, __func__, __LINE__, | ||
1449 | sgp->iovec_count); | ||
1450 | return -ENOMEM; | ||
1451 | } | ||
1452 | sksgio->iov = iov; | ||
1453 | sksgio->iovcnt = sgp->iovec_count; | ||
1454 | |||
1455 | if (copy_from_user(iov, sgp->dxferp, nbytes)) { | ||
1456 | pr_debug("%s:%s:%d copy_from_user iovec failed %p\n", | ||
1457 | skdev->name, __func__, __LINE__, sgp->dxferp); | ||
1458 | return -EFAULT; | ||
1459 | } | ||
1460 | |||
1461 | /* | ||
1462 | * Sum up the vecs, making sure they don't overflow | ||
1463 | */ | ||
1464 | iov_data_len = 0; | ||
1465 | for (i = 0; i < sgp->iovec_count; i++) { | ||
1466 | if (iov_data_len + iov[i].iov_len < iov_data_len) | ||
1467 | return -EINVAL; | ||
1468 | iov_data_len += iov[i].iov_len; | ||
1469 | } | ||
1470 | |||
1471 | /* SG_IO howto says that the shorter of the two wins */ | ||
1472 | if (sgp->dxfer_len < iov_data_len) { | ||
1473 | sksgio->iovcnt = iov_shorten((struct iovec *)iov, | ||
1474 | sgp->iovec_count, | ||
1475 | sgp->dxfer_len); | ||
1476 | sksgio->dxfer_len = sgp->dxfer_len; | ||
1477 | } else | ||
1478 | sksgio->dxfer_len = iov_data_len; | ||
1479 | } | ||
1480 | |||
1481 | if (sgp->dxfer_direction != SG_DXFER_NONE) { | ||
1482 | struct sg_iovec *iov = sksgio->iov; | ||
1483 | for (i = 0; i < sksgio->iovcnt; i++, iov++) { | ||
1484 | if (!access_ok(acc, iov->iov_base, iov->iov_len)) { | ||
1485 | pr_debug("%s:%s:%d access data failed %p/%d\n", | ||
1486 | skdev->name, __func__, __LINE__, | ||
1487 | iov->iov_base, (int)iov->iov_len); | ||
1488 | return -EFAULT; | ||
1489 | } | ||
1490 | } | ||
1491 | } | ||
1492 | |||
1493 | return 0; | ||
1494 | } | ||
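The validation above sums the per-segment iovec lengths while checking for wrap-around, then applies the SG_IO rule that the shorter of dxfer_len and the iovec total wins. The same check in a standalone form (illustrative only):

#include <stddef.h>
#include <stdio.h>
#include <sys/uio.h>

/* Returns 0 and writes the effective transfer length, or -1 on overflow. */
static int sum_iov_clamped(const struct iovec *iov, int cnt,
			   size_t dxfer_len, size_t *out)
{
	size_t total = 0;

	for (int i = 0; i < cnt; i++) {
		if (total + iov[i].iov_len < total)	/* wrapped around? */
			return -1;
		total += iov[i].iov_len;
	}
	/* SG_IO: the shorter of dxfer_len and the iovec total wins. */
	*out = total < dxfer_len ? total : dxfer_len;
	return 0;
}

int main(void)
{
	char a[10], b[20];
	struct iovec v[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
	size_t len;

	if (sum_iov_clamped(v, 2, 25, &len) == 0)
		printf("effective length %zu\n", len);	/* prints 25 */
	return 0;
}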
1495 | |||
1496 | static int skd_sg_io_obtain_skspcl(struct skd_device *skdev, | ||
1497 | struct skd_sg_io *sksgio) | ||
1498 | { | ||
1499 | struct skd_special_context *skspcl = NULL; | ||
1500 | int rc; | ||
1501 | |||
1502 | for (;;) { | ||
1503 | ulong flags; | ||
1504 | |||
1505 | spin_lock_irqsave(&skdev->lock, flags); | ||
1506 | skspcl = skdev->skspcl_free_list; | ||
1507 | if (skspcl != NULL) { | ||
1508 | skdev->skspcl_free_list = | ||
1509 | (struct skd_special_context *)skspcl->req.next; | ||
1510 | skspcl->req.id += SKD_ID_INCR; | ||
1511 | skspcl->req.state = SKD_REQ_STATE_SETUP; | ||
1512 | skspcl->orphaned = 0; | ||
1513 | skspcl->req.n_sg = 0; | ||
1514 | } | ||
1515 | spin_unlock_irqrestore(&skdev->lock, flags); | ||
1516 | |||
1517 | if (skspcl != NULL) { | ||
1518 | rc = 0; | ||
1519 | break; | ||
1520 | } | ||
1521 | |||
1522 | pr_debug("%s:%s:%d blocking\n", | ||
1523 | skdev->name, __func__, __LINE__); | ||
1524 | |||
1525 | rc = wait_event_interruptible_timeout( | ||
1526 | skdev->waitq, | ||
1527 | (skdev->skspcl_free_list != NULL), | ||
1528 | msecs_to_jiffies(sksgio->sg.timeout)); | ||
1529 | |||
1530 | pr_debug("%s:%s:%d unblocking, rc=%d\n", | ||
1531 | skdev->name, __func__, __LINE__, rc); | ||
1532 | |||
1533 | if (rc <= 0) { | ||
1534 | if (rc == 0) | ||
1535 | rc = -ETIMEDOUT; | ||
1536 | else | ||
1537 | rc = -EINTR; | ||
1538 | break; | ||
1539 | } | ||
1540 | /* | ||
1541 | * If we get here, rc > 0, meaning the timeout to | ||
1542 | * wait_event_interruptible_timeout() had time left and the | ||
1543 | * sought event -- a non-empty free list -- happened. | ||
1544 | * Retry the allocation. | ||
1545 | */ | ||
1546 | } | ||
1547 | sksgio->skspcl = skspcl; | ||
1548 | |||
1549 | return rc; | ||
1550 | } | ||
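wait_event_interruptible_timeout() returns 0 on timeout, a negative value when a signal interrupts the sleep, and the remaining jiffies on success; the loop above folds that into -ETIMEDOUT, -EINTR, or a retry. A userspace analog of that three-way mapping (not the kernel API, just the convention):

#include <errno.h>
#include <stdio.h>

/* Fold a wait result into the errno convention used above:
 *   0  -> timed out
 *  <0  -> interrupted by a signal
 *  >0  -> condition became true with time to spare
 */
static int classify_wait(long rc)
{
	if (rc == 0)
		return -ETIMEDOUT;
	if (rc < 0)
		return -EINTR;
	return 0;	/* success: the caller retries the allocation */
}

int main(void)
{
	printf("%d %d %d\n", classify_wait(0), classify_wait(-4), classify_wait(10));
	return 0;
}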
1551 | |||
1552 | static int skd_skreq_prep_buffering(struct skd_device *skdev, | ||
1553 | struct skd_request_context *skreq, | ||
1554 | u32 dxfer_len) | ||
1555 | { | ||
1556 | u32 resid = dxfer_len; | ||
1557 | |||
1558 | /* | ||
1559 | * The DMA engine must have aligned addresses and byte counts. | ||
1560 | */ | ||
1561 | resid += (-resid) & 3; | ||
1562 | skreq->sg_byte_count = resid; | ||
1563 | |||
1564 | skreq->n_sg = 0; | ||
1565 | |||
1566 | while (resid > 0) { | ||
1567 | u32 nbytes = PAGE_SIZE; | ||
1568 | u32 ix = skreq->n_sg; | ||
1569 | struct scatterlist *sg = &skreq->sg[ix]; | ||
1570 | struct fit_sg_descriptor *sksg = &skreq->sksg_list[ix]; | ||
1571 | struct page *page; | ||
1572 | |||
1573 | if (nbytes > resid) | ||
1574 | nbytes = resid; | ||
1575 | |||
1576 | page = alloc_page(GFP_KERNEL); | ||
1577 | if (page == NULL) | ||
1578 | return -ENOMEM; | ||
1579 | |||
1580 | sg_set_page(sg, page, nbytes, 0); | ||
1581 | |||
1582 | /* TODO: This should be going through a pci_???() | ||
1583 | * routine to do proper mapping. */ | ||
1584 | sksg->control = FIT_SGD_CONTROL_NOT_LAST; | ||
1585 | sksg->byte_count = nbytes; | ||
1586 | |||
1587 | sksg->host_side_addr = sg_phys(sg); | ||
1588 | |||
1589 | sksg->dev_side_addr = 0; | ||
1590 | sksg->next_desc_ptr = skreq->sksg_dma_address + | ||
1591 | (ix + 1) * sizeof(*sksg); | ||
1592 | |||
1593 | skreq->n_sg++; | ||
1594 | resid -= nbytes; | ||
1595 | } | ||
1596 | |||
1597 | if (skreq->n_sg > 0) { | ||
1598 | u32 ix = skreq->n_sg - 1; | ||
1599 | struct fit_sg_descriptor *sksg = &skreq->sksg_list[ix]; | ||
1600 | |||
1601 | sksg->control = FIT_SGD_CONTROL_LAST; | ||
1602 | sksg->next_desc_ptr = 0; | ||
1603 | } | ||
1604 | |||
1605 | if (unlikely(skdev->dbg_level > 1)) { | ||
1606 | u32 i; | ||
1607 | |||
1608 | pr_debug("%s:%s:%d skreq=%x sksg_list=%p sksg_dma=%llx\n", | ||
1609 | skdev->name, __func__, __LINE__, | ||
1610 | skreq->id, skreq->sksg_list, skreq->sksg_dma_address); | ||
1611 | for (i = 0; i < skreq->n_sg; i++) { | ||
1612 | struct fit_sg_descriptor *sgd = &skreq->sksg_list[i]; | ||
1613 | |||
1614 | pr_debug("%s:%s:%d sg[%d] count=%u ctrl=0x%x " | ||
1615 | "addr=0x%llx next=0x%llx\n", | ||
1616 | skdev->name, __func__, __LINE__, | ||
1617 | i, sgd->byte_count, sgd->control, | ||
1618 | sgd->host_side_addr, sgd->next_desc_ptr); | ||
1619 | } | ||
1620 | } | ||
1621 | |||
1622 | return 0; | ||
1623 | } | ||
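The expression resid += (-resid) & 3 rounds the transfer length up to the 4-byte multiple the DMA engine requires; the trick generalizes to any power-of-two alignment. A tiny standalone demonstration:

#include <stdio.h>

/* Round n up to a multiple of align (align must be a power of two).
 * (-n) & (align - 1) is exactly the padding needed, so no branch or
 * division is required. */
static unsigned int round_up_pow2(unsigned int n, unsigned int align)
{
	return n + ((-n) & (align - 1));
}

int main(void)
{
	for (unsigned int n = 0; n <= 8; n++)
		printf("%u -> %u\n", n, round_up_pow2(n, 4));
	return 0;
}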
1624 | |||
1625 | static int skd_sg_io_prep_buffering(struct skd_device *skdev, | ||
1626 | struct skd_sg_io *sksgio) | ||
1627 | { | ||
1628 | struct skd_special_context *skspcl = sksgio->skspcl; | ||
1629 | struct skd_request_context *skreq = &skspcl->req; | ||
1630 | u32 dxfer_len = sksgio->dxfer_len; | ||
1631 | int rc; | ||
1632 | |||
1633 | rc = skd_skreq_prep_buffering(skdev, skreq, dxfer_len); | ||
1634 | /* | ||
1635 | * Eventually, errors or not, skd_release_special() is called | ||
1636 | * to recover allocations including partial allocations. | ||
1637 | */ | ||
1638 | return rc; | ||
1639 | } | ||
1640 | |||
1641 | static int skd_sg_io_copy_buffer(struct skd_device *skdev, | ||
1642 | struct skd_sg_io *sksgio, int dxfer_dir) | ||
1643 | { | ||
1644 | struct skd_special_context *skspcl = sksgio->skspcl; | ||
1645 | u32 iov_ix = 0; | ||
1646 | struct sg_iovec curiov; | ||
1647 | u32 sksg_ix = 0; | ||
1648 | u8 *bufp = NULL; | ||
1649 | u32 buf_len = 0; | ||
1650 | u32 resid = sksgio->dxfer_len; | ||
1651 | int rc; | ||
1652 | |||
1653 | curiov.iov_len = 0; | ||
1654 | curiov.iov_base = NULL; | ||
1655 | |||
1656 | if (dxfer_dir != sksgio->sg.dxfer_direction) { | ||
1657 | if (dxfer_dir != SG_DXFER_TO_DEV || | ||
1658 | sksgio->sg.dxfer_direction != SG_DXFER_TO_FROM_DEV) | ||
1659 | return 0; | ||
1660 | } | ||
1661 | |||
1662 | while (resid > 0) { | ||
1663 | u32 nbytes = PAGE_SIZE; | ||
1664 | |||
1665 | if (curiov.iov_len == 0) { | ||
1666 | curiov = sksgio->iov[iov_ix++]; | ||
1667 | continue; | ||
1668 | } | ||
1669 | |||
1670 | if (buf_len == 0) { | ||
1671 | struct page *page; | ||
1672 | page = sg_page(&skspcl->req.sg[sksg_ix++]); | ||
1673 | bufp = page_address(page); | ||
1674 | buf_len = PAGE_SIZE; | ||
1675 | } | ||
1676 | |||
1677 | nbytes = min_t(u32, nbytes, resid); | ||
1678 | nbytes = min_t(u32, nbytes, curiov.iov_len); | ||
1679 | nbytes = min_t(u32, nbytes, buf_len); | ||
1680 | |||
1681 | if (dxfer_dir == SG_DXFER_TO_DEV) | ||
1682 | rc = __copy_from_user(bufp, curiov.iov_base, nbytes); | ||
1683 | else | ||
1684 | rc = __copy_to_user(curiov.iov_base, bufp, nbytes); | ||
1685 | |||
1686 | if (rc) | ||
1687 | return -EFAULT; | ||
1688 | |||
1689 | resid -= nbytes; | ||
1690 | curiov.iov_len -= nbytes; | ||
1691 | curiov.iov_base += nbytes; | ||
1692 | buf_len -= nbytes; | ||
1693 | } | ||
1694 | |||
1695 | return 0; | ||
1696 | } | ||
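The copy loop advances through the user iovec list and the page-sized bounce buffers in lock-step, each step moving the minimum of the bytes left in the current iovec, the current page, and the overall transfer. A userspace sketch of that three-way chunking (sizes are illustrative):

#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

#define CHUNK 8		/* stand-in for PAGE_SIZE */

static size_t min3(size_t a, size_t b, size_t c)
{
	size_t m = a < b ? a : b;
	return m < c ? m : c;
}

/* Copy 'resid' bytes from an iovec list into fixed-size chunks. */
static void gather(const struct iovec *iov, char dst[][CHUNK], size_t resid)
{
	size_t iov_off = 0, chunk_off = 0;
	int iov_ix = 0, chunk_ix = 0;

	while (resid > 0) {
		size_t iov_left = iov[iov_ix].iov_len - iov_off;
		size_t chunk_left = CHUNK - chunk_off;
		size_t n = min3(resid, iov_left, chunk_left);

		memcpy(dst[chunk_ix] + chunk_off,
		       (char *)iov[iov_ix].iov_base + iov_off, n);

		resid -= n;
		iov_off += n;
		chunk_off += n;
		if (iov_off == iov[iov_ix].iov_len) {
			iov_ix++;
			iov_off = 0;
		}
		if (chunk_off == CHUNK) {
			chunk_ix++;
			chunk_off = 0;
		}
	}
}

int main(void)
{
	char a[] = "hello ", b[] = "world!";
	struct iovec v[2] = { { a, 6 }, { b, 6 } };
	char out[2][CHUNK];

	gather(v, out, 12);
	printf("%.8s%.4s\n", out[0], out[1]);	/* prints "hello world!" */
	return 0;
}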
1697 | |||
1698 | static int skd_sg_io_send_fitmsg(struct skd_device *skdev, | ||
1699 | struct skd_sg_io *sksgio) | ||
1700 | { | ||
1701 | struct skd_special_context *skspcl = sksgio->skspcl; | ||
1702 | struct fit_msg_hdr *fmh = (struct fit_msg_hdr *)skspcl->msg_buf; | ||
1703 | struct skd_scsi_request *scsi_req = (struct skd_scsi_request *)&fmh[1]; | ||
1704 | |||
1705 | memset(skspcl->msg_buf, 0, SKD_N_SPECIAL_FITMSG_BYTES); | ||
1706 | |||
1707 | /* Initialize the FIT msg header */ | ||
1708 | fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT; | ||
1709 | fmh->num_protocol_cmds_coalesced = 1; | ||
1710 | |||
1711 | /* Initialize the SCSI request */ | ||
1712 | if (sksgio->sg.dxfer_direction != SG_DXFER_NONE) | ||
1713 | scsi_req->hdr.sg_list_dma_address = | ||
1714 | cpu_to_be64(skspcl->req.sksg_dma_address); | ||
1715 | scsi_req->hdr.tag = skspcl->req.id; | ||
1716 | scsi_req->hdr.sg_list_len_bytes = | ||
1717 | cpu_to_be32(skspcl->req.sg_byte_count); | ||
1718 | memcpy(scsi_req->cdb, sksgio->cdb, sizeof(scsi_req->cdb)); | ||
1719 | |||
1720 | skspcl->req.state = SKD_REQ_STATE_BUSY; | ||
1721 | skd_send_special_fitmsg(skdev, skspcl); | ||
1722 | |||
1723 | return 0; | ||
1724 | } | ||
1725 | |||
1726 | static int skd_sg_io_await(struct skd_device *skdev, struct skd_sg_io *sksgio) | ||
1727 | { | ||
1728 | unsigned long flags; | ||
1729 | int rc; | ||
1730 | |||
1731 | rc = wait_event_interruptible_timeout(skdev->waitq, | ||
1732 | (sksgio->skspcl->req.state != | ||
1733 | SKD_REQ_STATE_BUSY), | ||
1734 | msecs_to_jiffies(sksgio->sg. | ||
1735 | timeout)); | ||
1736 | |||
1737 | spin_lock_irqsave(&skdev->lock, flags); | ||
1738 | |||
1739 | if (sksgio->skspcl->req.state == SKD_REQ_STATE_ABORTED) { | ||
1740 | pr_debug("%s:%s:%d skspcl %p aborted\n", | ||
1741 | skdev->name, __func__, __LINE__, sksgio->skspcl); | ||
1742 | |||
1743 | /* Build check cond, sense and let command finish. */ | ||
1744 | /* For a timeout, we must fabricate completion and sense | ||
1745 | * data to complete the command */ | ||
1746 | sksgio->skspcl->req.completion.status = | ||
1747 | SAM_STAT_CHECK_CONDITION; | ||
1748 | |||
1749 | memset(&sksgio->skspcl->req.err_info, 0, | ||
1750 | sizeof(sksgio->skspcl->req.err_info)); | ||
1751 | sksgio->skspcl->req.err_info.type = 0x70; | ||
1752 | sksgio->skspcl->req.err_info.key = ABORTED_COMMAND; | ||
1753 | sksgio->skspcl->req.err_info.code = 0x44; | ||
1754 | sksgio->skspcl->req.err_info.qual = 0; | ||
1755 | rc = 0; | ||
1756 | } else if (sksgio->skspcl->req.state != SKD_REQ_STATE_BUSY) | ||
1757 | /* No longer on the adapter. We finish. */ | ||
1758 | rc = 0; | ||
1759 | else { | ||
1760 | /* Something's gone wrong. Still busy. Timeout or | ||
1761 | * user interrupted (control-C). Mark as an orphan | ||
1762 | * so it will be disposed of when completed. */ | ||
1763 | sksgio->skspcl->orphaned = 1; | ||
1764 | sksgio->skspcl = NULL; | ||
1765 | if (rc == 0) { | ||
1766 | pr_debug("%s:%s:%d timed out %p (%u ms)\n", | ||
1767 | skdev->name, __func__, __LINE__, | ||
1768 | sksgio, sksgio->sg.timeout); | ||
1769 | rc = -ETIMEDOUT; | ||
1770 | } else { | ||
1771 | pr_debug("%s:%s:%d cntlc %p\n", | ||
1772 | skdev->name, __func__, __LINE__, sksgio); | ||
1773 | rc = -EINTR; | ||
1774 | } | ||
1775 | } | ||
1776 | |||
1777 | spin_unlock_irqrestore(&skdev->lock, flags); | ||
1778 | |||
1779 | return rc; | ||
1780 | } | ||
1781 | |||
1782 | static int skd_sg_io_put_status(struct skd_device *skdev, | ||
1783 | struct skd_sg_io *sksgio) | ||
1784 | { | ||
1785 | struct sg_io_hdr *sgp = &sksgio->sg; | ||
1786 | struct skd_special_context *skspcl = sksgio->skspcl; | ||
1787 | int resid = 0; | ||
1788 | |||
1789 | u32 nb = be32_to_cpu(skspcl->req.completion.num_returned_bytes); | ||
1790 | |||
1791 | sgp->status = skspcl->req.completion.status; | ||
1792 | resid = sksgio->dxfer_len - nb; | ||
1793 | |||
1794 | sgp->masked_status = sgp->status & STATUS_MASK; | ||
1795 | sgp->msg_status = 0; | ||
1796 | sgp->host_status = 0; | ||
1797 | sgp->driver_status = 0; | ||
1798 | sgp->resid = resid; | ||
1799 | if (sgp->masked_status || sgp->host_status || sgp->driver_status) | ||
1800 | sgp->info |= SG_INFO_CHECK; | ||
1801 | |||
1802 | pr_debug("%s:%s:%d status %x masked %x resid 0x%x\n", | ||
1803 | skdev->name, __func__, __LINE__, | ||
1804 | sgp->status, sgp->masked_status, sgp->resid); | ||
1805 | |||
1806 | if (sgp->masked_status == SAM_STAT_CHECK_CONDITION) { | ||
1807 | if (sgp->mx_sb_len > 0) { | ||
1808 | struct fit_comp_error_info *ei = &skspcl->req.err_info; | ||
1809 | u32 nbytes = sizeof(*ei); | ||
1810 | |||
1811 | nbytes = min_t(u32, nbytes, sgp->mx_sb_len); | ||
1812 | |||
1813 | sgp->sb_len_wr = nbytes; | ||
1814 | |||
1815 | if (__copy_to_user(sgp->sbp, ei, nbytes)) { | ||
1816 | pr_debug("%s:%s:%d copy_to_user sense failed %p\n", | ||
1817 | skdev->name, __func__, __LINE__, | ||
1818 | sgp->sbp); | ||
1819 | return -EFAULT; | ||
1820 | } | ||
1821 | } | ||
1822 | } | ||
1823 | |||
1824 | if (__copy_to_user(sksgio->argp, sgp, sizeof(sg_io_hdr_t))) { | ||
1825 | pr_debug("%s:%s:%d copy_to_user sg failed %p\n", | ||
1826 | skdev->name, __func__, __LINE__, sksgio->argp); | ||
1827 | return -EFAULT; | ||
1828 | } | ||
1829 | |||
1830 | return 0; | ||
1831 | } | ||
1832 | |||
1833 | static int skd_sg_io_release_skspcl(struct skd_device *skdev, | ||
1834 | struct skd_sg_io *sksgio) | ||
1835 | { | ||
1836 | struct skd_special_context *skspcl = sksgio->skspcl; | ||
1837 | |||
1838 | if (skspcl != NULL) { | ||
1839 | ulong flags; | ||
1840 | |||
1841 | sksgio->skspcl = NULL; | ||
1842 | |||
1843 | spin_lock_irqsave(&skdev->lock, flags); | ||
1844 | skd_release_special(skdev, skspcl); | ||
1845 | spin_unlock_irqrestore(&skdev->lock, flags); | ||
1846 | } | ||
1847 | |||
1848 | return 0; | ||
1849 | } | ||
1850 | |||
1851 | /* | ||
1852 | ***************************************************************************** | ||
1853 | * INTERNAL REQUESTS -- generated by driver itself | ||
1854 | ***************************************************************************** | ||
1855 | */ | ||
1856 | |||
1857 | static int skd_format_internal_skspcl(struct skd_device *skdev) | ||
1858 | { | ||
1859 | struct skd_special_context *skspcl = &skdev->internal_skspcl; | ||
1860 | struct fit_sg_descriptor *sgd = &skspcl->req.sksg_list[0]; | ||
1861 | struct fit_msg_hdr *fmh; | ||
1862 | uint64_t dma_address; | ||
1863 | struct skd_scsi_request *scsi; | ||
1864 | |||
1865 | fmh = (struct fit_msg_hdr *)&skspcl->msg_buf[0]; | ||
1866 | fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT; | ||
1867 | fmh->num_protocol_cmds_coalesced = 1; | ||
1868 | |||
1869 | scsi = (struct skd_scsi_request *)&skspcl->msg_buf[64]; | ||
1870 | memset(scsi, 0, sizeof(*scsi)); | ||
1871 | dma_address = skspcl->req.sksg_dma_address; | ||
1872 | scsi->hdr.sg_list_dma_address = cpu_to_be64(dma_address); | ||
1873 | sgd->control = FIT_SGD_CONTROL_LAST; | ||
1874 | sgd->byte_count = 0; | ||
1875 | sgd->host_side_addr = skspcl->db_dma_address; | ||
1876 | sgd->dev_side_addr = 0; | ||
1877 | sgd->next_desc_ptr = 0LL; | ||
1878 | |||
1879 | return 1; | ||
1880 | } | ||
1881 | |||
1882 | #define WR_BUF_SIZE SKD_N_INTERNAL_BYTES | ||
1883 | |||
1884 | static void skd_send_internal_skspcl(struct skd_device *skdev, | ||
1885 | struct skd_special_context *skspcl, | ||
1886 | u8 opcode) | ||
1887 | { | ||
1888 | struct fit_sg_descriptor *sgd = &skspcl->req.sksg_list[0]; | ||
1889 | struct skd_scsi_request *scsi; | ||
1890 | unsigned char *buf = skspcl->data_buf; | ||
1891 | int i; | ||
1892 | |||
1893 | if (skspcl->req.state != SKD_REQ_STATE_IDLE) | ||
1894 | /* | ||
1895 | * A refresh is already in progress. | ||
1896 | * Just wait for it to finish. | ||
1897 | */ | ||
1898 | return; | ||
1899 | |||
1900 | SKD_ASSERT((skspcl->req.id & SKD_ID_INCR) == 0); | ||
1901 | skspcl->req.state = SKD_REQ_STATE_BUSY; | ||
1902 | skspcl->req.id += SKD_ID_INCR; | ||
1903 | |||
1904 | scsi = (struct skd_scsi_request *)&skspcl->msg_buf[64]; | ||
1905 | scsi->hdr.tag = skspcl->req.id; | ||
1906 | |||
1907 | memset(scsi->cdb, 0, sizeof(scsi->cdb)); | ||
1908 | |||
1909 | switch (opcode) { | ||
1910 | case TEST_UNIT_READY: | ||
1911 | scsi->cdb[0] = TEST_UNIT_READY; | ||
1912 | sgd->byte_count = 0; | ||
1913 | scsi->hdr.sg_list_len_bytes = 0; | ||
1914 | break; | ||
1915 | |||
1916 | case READ_CAPACITY: | ||
1917 | scsi->cdb[0] = READ_CAPACITY; | ||
1918 | sgd->byte_count = SKD_N_READ_CAP_BYTES; | ||
1919 | scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count); | ||
1920 | break; | ||
1921 | |||
1922 | case INQUIRY: | ||
1923 | scsi->cdb[0] = INQUIRY; | ||
1924 | scsi->cdb[1] = 0x01; /* evpd */ | ||
1925 | scsi->cdb[2] = 0x80; /* serial number page */ | ||
1926 | scsi->cdb[4] = 0x10; | ||
1927 | sgd->byte_count = 16; | ||
1928 | scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count); | ||
1929 | break; | ||
1930 | |||
1931 | case SYNCHRONIZE_CACHE: | ||
1932 | scsi->cdb[0] = SYNCHRONIZE_CACHE; | ||
1933 | sgd->byte_count = 0; | ||
1934 | scsi->hdr.sg_list_len_bytes = 0; | ||
1935 | break; | ||
1936 | |||
1937 | case WRITE_BUFFER: | ||
1938 | scsi->cdb[0] = WRITE_BUFFER; | ||
1939 | scsi->cdb[1] = 0x02; | ||
1940 | scsi->cdb[7] = (WR_BUF_SIZE & 0xFF00) >> 8; | ||
1941 | scsi->cdb[8] = WR_BUF_SIZE & 0xFF; | ||
1942 | sgd->byte_count = WR_BUF_SIZE; | ||
1943 | scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count); | ||
1944 | /* fill incrementing byte pattern */ | ||
1945 | for (i = 0; i < sgd->byte_count; i++) | ||
1946 | buf[i] = i & 0xFF; | ||
1947 | break; | ||
1948 | |||
1949 | case READ_BUFFER: | ||
1950 | scsi->cdb[0] = READ_BUFFER; | ||
1951 | scsi->cdb[1] = 0x02; | ||
1952 | scsi->cdb[7] = (WR_BUF_SIZE & 0xFF00) >> 8; | ||
1953 | scsi->cdb[8] = WR_BUF_SIZE & 0xFF; | ||
1954 | sgd->byte_count = WR_BUF_SIZE; | ||
1955 | scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count); | ||
1956 | memset(skspcl->data_buf, 0, sgd->byte_count); | ||
1957 | break; | ||
1958 | |||
1959 | default: | ||
1960 | SKD_ASSERT("Don't know what to send"); | ||
1961 | return; | ||
1962 | |||
1963 | } | ||
1964 | skd_send_special_fitmsg(skdev, skspcl); | ||
1965 | } | ||
1966 | |||
1967 | static void skd_refresh_device_data(struct skd_device *skdev) | ||
1968 | { | ||
1969 | struct skd_special_context *skspcl = &skdev->internal_skspcl; | ||
1970 | |||
1971 | skd_send_internal_skspcl(skdev, skspcl, TEST_UNIT_READY); | ||
1972 | } | ||
1973 | |||
1974 | static int skd_chk_read_buf(struct skd_device *skdev, | ||
1975 | struct skd_special_context *skspcl) | ||
1976 | { | ||
1977 | unsigned char *buf = skspcl->data_buf; | ||
1978 | int i; | ||
1979 | |||
1980 | /* check for incrementing byte pattern */ | ||
1981 | for (i = 0; i < WR_BUF_SIZE; i++) | ||
1982 | if (buf[i] != (i & 0xFF)) | ||
1983 | return 1; | ||
1984 | |||
1985 | return 0; | ||
1986 | } | ||
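The connect test sends WRITE_BUFFER with an incrementing byte pattern, reads it back with READ_BUFFER, and skd_chk_read_buf() verifies the round trip. The essence of the check, reduced to a standalone program (the buffer size is illustrative, not SKD_N_INTERNAL_BYTES):

#include <stdio.h>
#include <string.h>

#define BUF_SIZE 512	/* stand-in for the internal buffer size */

static void fill_pattern(unsigned char *buf, int len)
{
	for (int i = 0; i < len; i++)
		buf[i] = i & 0xFF;	/* incrementing byte pattern */
}

static int check_pattern(const unsigned char *buf, int len)
{
	for (int i = 0; i < len; i++)
		if (buf[i] != (i & 0xFF))
			return 1;	/* mismatch */
	return 0;
}

int main(void)
{
	unsigned char wr[BUF_SIZE], rd[BUF_SIZE];

	fill_pattern(wr, BUF_SIZE);
	memcpy(rd, wr, BUF_SIZE);	/* stands in for the device round trip */
	printf("mismatch=%d\n", check_pattern(rd, BUF_SIZE));
	return 0;
}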
1987 | |||
1988 | static void skd_log_check_status(struct skd_device *skdev, u8 status, u8 key, | ||
1989 | u8 code, u8 qual, u8 fruc) | ||
1990 | { | ||
1991 | /* If the check condition is of special interest, log a message */ | ||
1992 | if ((status == SAM_STAT_CHECK_CONDITION) && (key == 0x02) | ||
1993 | && (code == 0x04) && (qual == 0x06)) { | ||
1994 | pr_err("(%s): *** LOST_WRITE_DATA ERROR *** key/asc/" | ||
1995 | "ascq/fruc %02x/%02x/%02x/%02x\n", | ||
1996 | skd_name(skdev), key, code, qual, fruc); | ||
1997 | } | ||
1998 | } | ||
1999 | |||
2000 | static void skd_complete_internal(struct skd_device *skdev, | ||
2001 | volatile struct fit_completion_entry_v1 | ||
2002 | *skcomp, | ||
2003 | volatile struct fit_comp_error_info *skerr, | ||
2004 | struct skd_special_context *skspcl) | ||
2005 | { | ||
2006 | u8 *buf = skspcl->data_buf; | ||
2007 | u8 status; | ||
2008 | int i; | ||
2009 | struct skd_scsi_request *scsi = | ||
2010 | (struct skd_scsi_request *)&skspcl->msg_buf[64]; | ||
2011 | |||
2012 | SKD_ASSERT(skspcl == &skdev->internal_skspcl); | ||
2013 | |||
2014 | pr_debug("%s:%s:%d complete internal %x\n", | ||
2015 | skdev->name, __func__, __LINE__, scsi->cdb[0]); | ||
2016 | |||
2017 | skspcl->req.completion = *skcomp; | ||
2018 | skspcl->req.state = SKD_REQ_STATE_IDLE; | ||
2019 | skspcl->req.id += SKD_ID_INCR; | ||
2020 | |||
2021 | status = skspcl->req.completion.status; | ||
2022 | |||
2023 | skd_log_check_status(skdev, status, skerr->key, skerr->code, | ||
2024 | skerr->qual, skerr->fruc); | ||
2025 | |||
2026 | switch (scsi->cdb[0]) { | ||
2027 | case TEST_UNIT_READY: | ||
2028 | if (status == SAM_STAT_GOOD) | ||
2029 | skd_send_internal_skspcl(skdev, skspcl, WRITE_BUFFER); | ||
2030 | else if ((status == SAM_STAT_CHECK_CONDITION) && | ||
2031 | (skerr->key == MEDIUM_ERROR)) | ||
2032 | skd_send_internal_skspcl(skdev, skspcl, WRITE_BUFFER); | ||
2033 | else { | ||
2034 | if (skdev->state == SKD_DRVR_STATE_STOPPING) { | ||
2035 | pr_debug("%s:%s:%d TUR failed, don't send anymore state 0x%x\n", | ||
2036 | skdev->name, __func__, __LINE__, | ||
2037 | skdev->state); | ||
2038 | return; | ||
2039 | } | ||
2040 | pr_debug("%s:%s:%d **** TUR failed, retry skerr\n", | ||
2041 | skdev->name, __func__, __LINE__); | ||
2042 | skd_send_internal_skspcl(skdev, skspcl, 0x00); | ||
2043 | } | ||
2044 | break; | ||
2045 | |||
2046 | case WRITE_BUFFER: | ||
2047 | if (status == SAM_STAT_GOOD) | ||
2048 | skd_send_internal_skspcl(skdev, skspcl, READ_BUFFER); | ||
2049 | else { | ||
2050 | if (skdev->state == SKD_DRVR_STATE_STOPPING) { | ||
2051 | pr_debug("%s:%s:%d write buffer failed, don't send anymore state 0x%x\n", | ||
2052 | skdev->name, __func__, __LINE__, | ||
2053 | skdev->state); | ||
2054 | return; | ||
2055 | } | ||
2056 | pr_debug("%s:%s:%d **** write buffer failed, retry skerr\n", | ||
2057 | skdev->name, __func__, __LINE__); | ||
2058 | skd_send_internal_skspcl(skdev, skspcl, 0x00); | ||
2059 | } | ||
2060 | break; | ||
2061 | |||
2062 | case READ_BUFFER: | ||
2063 | if (status == SAM_STAT_GOOD) { | ||
2064 | if (skd_chk_read_buf(skdev, skspcl) == 0) | ||
2065 | skd_send_internal_skspcl(skdev, skspcl, | ||
2066 | READ_CAPACITY); | ||
2067 | else { | ||
2068 | pr_err( | ||
2069 | "(%s):*** W/R Buffer mismatch %d ***\n", | ||
2070 | skd_name(skdev), skdev->connect_retries); | ||
2071 | if (skdev->connect_retries < | ||
2072 | SKD_MAX_CONNECT_RETRIES) { | ||
2073 | skdev->connect_retries++; | ||
2074 | skd_soft_reset(skdev); | ||
2075 | } else { | ||
2076 | pr_err( | ||
2077 | "(%s): W/R Buffer Connect Error\n", | ||
2078 | skd_name(skdev)); | ||
2079 | return; | ||
2080 | } | ||
2081 | } | ||
2082 | |||
2083 | } else { | ||
2084 | if (skdev->state == SKD_DRVR_STATE_STOPPING) { | ||
2085 | pr_debug("%s:%s:%d " | ||
2086 | "read buffer failed, don't send anymore state 0x%x\n", | ||
2087 | skdev->name, __func__, __LINE__, | ||
2088 | skdev->state); | ||
2089 | return; | ||
2090 | } | ||
2091 | pr_debug("%s:%s:%d " | ||
2092 | "**** read buffer failed, retry skerr\n", | ||
2093 | skdev->name, __func__, __LINE__); | ||
2094 | skd_send_internal_skspcl(skdev, skspcl, 0x00); | ||
2095 | } | ||
2096 | break; | ||
2097 | |||
2098 | case READ_CAPACITY: | ||
2099 | skdev->read_cap_is_valid = 0; | ||
2100 | if (status == SAM_STAT_GOOD) { | ||
2101 | skdev->read_cap_last_lba = | ||
2102 | (buf[0] << 24) | (buf[1] << 16) | | ||
2103 | (buf[2] << 8) | buf[3]; | ||
2104 | skdev->read_cap_blocksize = | ||
2105 | (buf[4] << 24) | (buf[5] << 16) | | ||
2106 | (buf[6] << 8) | buf[7]; | ||
2107 | |||
2108 | pr_debug("%s:%s:%d last lba %d, bs %d\n", | ||
2109 | skdev->name, __func__, __LINE__, | ||
2110 | skdev->read_cap_last_lba, | ||
2111 | skdev->read_cap_blocksize); | ||
2112 | |||
2113 | set_capacity(skdev->disk, skdev->read_cap_last_lba + 1); | ||
2114 | |||
2115 | skdev->read_cap_is_valid = 1; | ||
2116 | |||
2117 | skd_send_internal_skspcl(skdev, skspcl, INQUIRY); | ||
2118 | } else if ((status == SAM_STAT_CHECK_CONDITION) && | ||
2119 | (skerr->key == MEDIUM_ERROR)) { | ||
2120 | skdev->read_cap_last_lba = ~0; | ||
2121 | set_capacity(skdev->disk, skdev->read_cap_last_lba + 1); | ||
2122 | pr_debug("%s:%s:%d " | ||
2123 | "**** MEDIUM ERROR caused READCAP to fail, ignore failure and continue to inquiry\n", | ||
2124 | skdev->name, __func__, __LINE__); | ||
2125 | skd_send_internal_skspcl(skdev, skspcl, INQUIRY); | ||
2126 | } else { | ||
2127 | pr_debug("%s:%s:%d **** READCAP failed, retry TUR\n", | ||
2128 | skdev->name, __func__, __LINE__); | ||
2129 | skd_send_internal_skspcl(skdev, skspcl, | ||
2130 | TEST_UNIT_READY); | ||
2131 | } | ||
2132 | break; | ||
2133 | |||
2134 | case INQUIRY: | ||
2135 | skdev->inquiry_is_valid = 0; | ||
2136 | if (status == SAM_STAT_GOOD) { | ||
2137 | skdev->inquiry_is_valid = 1; | ||
2138 | |||
2139 | for (i = 0; i < 12; i++) | ||
2140 | skdev->inq_serial_num[i] = buf[i + 4]; | ||
2141 | skdev->inq_serial_num[12] = 0; | ||
2142 | } | ||
2143 | |||
2144 | if (skd_unquiesce_dev(skdev) < 0) | ||
2145 | pr_debug("%s:%s:%d **** failed to ONLINE device\n", | ||
2146 | skdev->name, __func__, __LINE__); | ||
2147 | /* connection is complete */ | ||
2148 | skdev->connect_retries = 0; | ||
2149 | break; | ||
2150 | |||
2151 | case SYNCHRONIZE_CACHE: | ||
2152 | if (status == SAM_STAT_GOOD) | ||
2153 | skdev->sync_done = 1; | ||
2154 | else | ||
2155 | skdev->sync_done = -1; | ||
2156 | wake_up_interruptible(&skdev->waitq); | ||
2157 | break; | ||
2158 | |||
2159 | default: | ||
2160 | SKD_ASSERT("we didn't send this"); | ||
2161 | } | ||
2162 | } | ||
2163 | |||
2164 | /* | ||
2165 | ***************************************************************************** | ||
2166 | * FIT MESSAGES | ||
2167 | ***************************************************************************** | ||
2168 | */ | ||
2169 | |||
2170 | static void skd_send_fitmsg(struct skd_device *skdev, | ||
2171 | struct skd_fitmsg_context *skmsg) | ||
2172 | { | ||
2173 | u64 qcmd; | ||
2174 | struct fit_msg_hdr *fmh; | ||
2175 | |||
2176 | pr_debug("%s:%s:%d dma address 0x%llx, busy=%d\n", | ||
2177 | skdev->name, __func__, __LINE__, | ||
2178 | skmsg->mb_dma_address, skdev->in_flight); | ||
2179 | pr_debug("%s:%s:%d msg_buf 0x%p, offset %x\n", | ||
2180 | skdev->name, __func__, __LINE__, | ||
2181 | skmsg->msg_buf, skmsg->offset); | ||
2182 | |||
2183 | qcmd = skmsg->mb_dma_address; | ||
2184 | qcmd |= FIT_QCMD_QID_NORMAL; | ||
2185 | |||
2186 | fmh = (struct fit_msg_hdr *)skmsg->msg_buf; | ||
2187 | skmsg->outstanding = fmh->num_protocol_cmds_coalesced; | ||
2188 | |||
2189 | if (unlikely(skdev->dbg_level > 1)) { | ||
2190 | u8 *bp = (u8 *)skmsg->msg_buf; | ||
2191 | int i; | ||
2192 | for (i = 0; i < skmsg->length; i += 8) { | ||
2193 | pr_debug("%s:%s:%d msg[%2d] %02x %02x %02x %02x " | ||
2194 | "%02x %02x %02x %02x\n", | ||
2195 | skdev->name, __func__, __LINE__, | ||
2196 | i, bp[i + 0], bp[i + 1], bp[i + 2], | ||
2197 | bp[i + 3], bp[i + 4], bp[i + 5], | ||
2198 | bp[i + 6], bp[i + 7]); | ||
2199 | if (i == 0) | ||
2200 | i = 64 - 8; | ||
2201 | } | ||
2202 | } | ||
2203 | |||
2204 | if (skmsg->length > 256) | ||
2205 | qcmd |= FIT_QCMD_MSGSIZE_512; | ||
2206 | else if (skmsg->length > 128) | ||
2207 | qcmd |= FIT_QCMD_MSGSIZE_256; | ||
2208 | else if (skmsg->length > 64) | ||
2209 | qcmd |= FIT_QCMD_MSGSIZE_128; | ||
2210 | else | ||
2211 | /* | ||
2212 | * This makes no sense because the FIT msg header is | ||
2213 | * 64 bytes. If the msg is only 64 bytes long it has | ||
2214 | * no payload. | ||
2215 | */ | ||
2216 | qcmd |= FIT_QCMD_MSGSIZE_64; | ||
2217 | |||
2218 | SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND); | ||
2219 | |||
2220 | } | ||
2221 | |||
2222 | static void skd_send_special_fitmsg(struct skd_device *skdev, | ||
2223 | struct skd_special_context *skspcl) | ||
2224 | { | ||
2225 | u64 qcmd; | ||
2226 | |||
2227 | if (unlikely(skdev->dbg_level > 1)) { | ||
2228 | u8 *bp = (u8 *)skspcl->msg_buf; | ||
2229 | int i; | ||
2230 | |||
2231 | for (i = 0; i < SKD_N_SPECIAL_FITMSG_BYTES; i += 8) { | ||
2232 | pr_debug("%s:%s:%d spcl[%2d] %02x %02x %02x %02x " | ||
2233 | "%02x %02x %02x %02x\n", | ||
2234 | skdev->name, __func__, __LINE__, i, | ||
2235 | bp[i + 0], bp[i + 1], bp[i + 2], bp[i + 3], | ||
2236 | bp[i + 4], bp[i + 5], bp[i + 6], bp[i + 7]); | ||
2237 | if (i == 0) | ||
2238 | i = 64 - 8; | ||
2239 | } | ||
2240 | |||
2241 | pr_debug("%s:%s:%d skspcl=%p id=%04x sksg_list=%p sksg_dma=%llx\n", | ||
2242 | skdev->name, __func__, __LINE__, | ||
2243 | skspcl, skspcl->req.id, skspcl->req.sksg_list, | ||
2244 | skspcl->req.sksg_dma_address); | ||
2245 | for (i = 0; i < skspcl->req.n_sg; i++) { | ||
2246 | struct fit_sg_descriptor *sgd = | ||
2247 | &skspcl->req.sksg_list[i]; | ||
2248 | |||
2249 | pr_debug("%s:%s:%d sg[%d] count=%u ctrl=0x%x " | ||
2250 | "addr=0x%llx next=0x%llx\n", | ||
2251 | skdev->name, __func__, __LINE__, | ||
2252 | i, sgd->byte_count, sgd->control, | ||
2253 | sgd->host_side_addr, sgd->next_desc_ptr); | ||
2254 | } | ||
2255 | } | ||
2256 | |||
2257 | /* | ||
2258 | * Special FIT msgs are always 128 bytes: a 64-byte FIT hdr | ||
2259 | * and one 64-byte SSDI command. | ||
2260 | */ | ||
2261 | qcmd = skspcl->mb_dma_address; | ||
2262 | qcmd |= FIT_QCMD_QID_NORMAL + FIT_QCMD_MSGSIZE_128; | ||
2263 | |||
2264 | SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND); | ||
2265 | } | ||
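A command is queued by writing the FIT message's DMA address to FIT_Q_COMMAND with the queue ID and message-size code OR'ed into the low bits, which is safe because the message buffer's alignment leaves those bits zero. A sketch of the flags-in-low-bits packing (the constants are made up, not the real FIT_QCMD_* encodings):

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: the real FIT_QCMD_* encodings live in the driver
 * headers; these values just show the flags-in-low-bits idea. */
#define QID_NORMAL	0x1ULL
#define MSGSIZE_128	0x4ULL

static uint64_t make_qcmd(uint64_t dma_addr, uint64_t flags)
{
	/* dma_addr must be aligned so the flag bits are otherwise zero. */
	return dma_addr | flags;
}

int main(void)
{
	uint64_t qcmd = make_qcmd(0x12345000ULL, QID_NORMAL + MSGSIZE_128);

	printf("qcmd=0x%llx addr=0x%llx flags=0x%llx\n",
	       (unsigned long long)qcmd,
	       (unsigned long long)(qcmd & ~0xFFFULL),
	       (unsigned long long)(qcmd & 0xFFFULL));
	return 0;
}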
2266 | |||
2267 | /* | ||
2268 | ***************************************************************************** | ||
2269 | * COMPLETION QUEUE | ||
2270 | ***************************************************************************** | ||
2271 | */ | ||
2272 | |||
2273 | static void skd_complete_other(struct skd_device *skdev, | ||
2274 | volatile struct fit_completion_entry_v1 *skcomp, | ||
2275 | volatile struct fit_comp_error_info *skerr); | ||
2276 | |||
2277 | struct sns_info { | ||
2278 | u8 type; | ||
2279 | u8 stat; | ||
2280 | u8 key; | ||
2281 | u8 asc; | ||
2282 | u8 ascq; | ||
2283 | u8 mask; | ||
2284 | enum skd_check_status_action action; | ||
2285 | }; | ||
2286 | |||
2287 | static struct sns_info skd_chkstat_table[] = { | ||
2288 | /* Good */ | ||
2289 | { 0x70, 0x02, RECOVERED_ERROR, 0, 0, 0x1c, | ||
2290 | SKD_CHECK_STATUS_REPORT_GOOD }, | ||
2291 | |||
2292 | /* Smart alerts */ | ||
2293 | { 0x70, 0x02, NO_SENSE, 0x0B, 0x00, 0x1E, /* warnings */ | ||
2294 | SKD_CHECK_STATUS_REPORT_SMART_ALERT }, | ||
2295 | { 0x70, 0x02, NO_SENSE, 0x5D, 0x00, 0x1E, /* thresholds */ | ||
2296 | SKD_CHECK_STATUS_REPORT_SMART_ALERT }, | ||
2297 | { 0x70, 0x02, RECOVERED_ERROR, 0x0B, 0x01, 0x1F, /* temperature over trigger */ | ||
2298 | SKD_CHECK_STATUS_REPORT_SMART_ALERT }, | ||
2299 | |||
2300 | /* Retry (with limits) */ | ||
2301 | { 0x70, 0x02, 0x0B, 0, 0, 0x1C, /* This one is for DMA ERROR */ | ||
2302 | SKD_CHECK_STATUS_REQUEUE_REQUEST }, | ||
2303 | { 0x70, 0x02, 0x06, 0x0B, 0x00, 0x1E, /* warnings */ | ||
2304 | SKD_CHECK_STATUS_REQUEUE_REQUEST }, | ||
2305 | { 0x70, 0x02, 0x06, 0x5D, 0x00, 0x1E, /* thresholds */ | ||
2306 | SKD_CHECK_STATUS_REQUEUE_REQUEST }, | ||
2307 | { 0x70, 0x02, 0x06, 0x80, 0x30, 0x1F, /* backup power */ | ||
2308 | SKD_CHECK_STATUS_REQUEUE_REQUEST }, | ||
2309 | |||
2310 | /* Busy (or about to be) */ | ||
2311 | { 0x70, 0x02, 0x06, 0x3f, 0x01, 0x1F, /* fw changed */ | ||
2312 | SKD_CHECK_STATUS_BUSY_IMMINENT }, | ||
2313 | }; | ||
2314 | |||
2315 | /* | ||
2316 | * Look up status and sense data to decide how to handle the error | ||
2317 | * from the device. | ||
2318 | * mask says which fields must match, e.g. mask=0x18 means check | ||
2319 | * type and stat, ignore key, asc, ascq. | ||
2320 | */ | ||
2321 | |||
2322 | static enum skd_check_status_action | ||
2323 | skd_check_status(struct skd_device *skdev, | ||
2324 | u8 cmp_status, volatile struct fit_comp_error_info *skerr) | ||
2325 | { | ||
2326 | int i, n; | ||
2327 | |||
2328 | pr_err("(%s): key/asc/ascq/fruc %02x/%02x/%02x/%02x\n", | ||
2329 | skd_name(skdev), skerr->key, skerr->code, skerr->qual, | ||
2330 | skerr->fruc); | ||
2331 | |||
2332 | pr_debug("%s:%s:%d stat: t=%02x stat=%02x k=%02x c=%02x q=%02x fruc=%02x\n", | ||
2333 | skdev->name, __func__, __LINE__, skerr->type, cmp_status, | ||
2334 | skerr->key, skerr->code, skerr->qual, skerr->fruc); | ||
2335 | |||
2336 | /* Does the info match an entry in the good category? */ | ||
2337 | n = sizeof(skd_chkstat_table) / sizeof(skd_chkstat_table[0]); | ||
2338 | for (i = 0; i < n; i++) { | ||
2339 | struct sns_info *sns = &skd_chkstat_table[i]; | ||
2340 | |||
2341 | if (sns->mask & 0x10) | ||
2342 | if (skerr->type != sns->type) | ||
2343 | continue; | ||
2344 | |||
2345 | if (sns->mask & 0x08) | ||
2346 | if (cmp_status != sns->stat) | ||
2347 | continue; | ||
2348 | |||
2349 | if (sns->mask & 0x04) | ||
2350 | if (skerr->key != sns->key) | ||
2351 | continue; | ||
2352 | |||
2353 | if (sns->mask & 0x02) | ||
2354 | if (skerr->code != sns->asc) | ||
2355 | continue; | ||
2356 | |||
2357 | if (sns->mask & 0x01) | ||
2358 | if (skerr->qual != sns->ascq) | ||
2359 | continue; | ||
2360 | |||
2361 | if (sns->action == SKD_CHECK_STATUS_REPORT_SMART_ALERT) { | ||
2362 | pr_err("(%s): SMART Alert: sense key/asc/ascq " | ||
2363 | "%02x/%02x/%02x\n", | ||
2364 | skd_name(skdev), skerr->key, | ||
2365 | skerr->code, skerr->qual); | ||
2366 | } | ||
2367 | return sns->action; | ||
2368 | } | ||
2369 | |||
2370 | /* No other match, so nonzero status means error, | ||
2371 | * zero status means good | ||
2372 | */ | ||
2373 | if (cmp_status) { | ||
2374 | pr_debug("%s:%s:%d status check: error\n", | ||
2375 | skdev->name, __func__, __LINE__); | ||
2376 | return SKD_CHECK_STATUS_REPORT_ERROR; | ||
2377 | } | ||
2378 | |||
2379 | pr_debug("%s:%s:%d status check good default\n", | ||
2380 | skdev->name, __func__, __LINE__); | ||
2381 | return SKD_CHECK_STATUS_REPORT_GOOD; | ||
2382 | } | ||
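Each skd_chkstat_table entry's mask selects which of the five sense fields must match; for example 0x1C compares type, stat and key while ignoring asc/ascq. A compact standalone version of that table walk (the entries are illustrative, not the driver's table):

#include <stdio.h>

enum action { ACT_GOOD, ACT_REQUEUE, ACT_ERROR };

struct sns_entry {
	unsigned char type, stat, key, asc, ascq, mask;
	enum action action;
};

/* mask bits: 0x10 type, 0x08 stat, 0x04 key, 0x02 asc, 0x01 ascq */
static const struct sns_entry table[] = {
	{ 0x70, 0x02, 0x01, 0x00, 0x00, 0x1c, ACT_GOOD },	/* recovered */
	{ 0x70, 0x02, 0x0b, 0x00, 0x00, 0x1c, ACT_REQUEUE },	/* aborted  */
};

static enum action lookup(unsigned char type, unsigned char stat,
			  unsigned char key, unsigned char asc,
			  unsigned char ascq)
{
	for (unsigned i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
		const struct sns_entry *e = &table[i];

		if ((e->mask & 0x10) && type != e->type)
			continue;
		if ((e->mask & 0x08) && stat != e->stat)
			continue;
		if ((e->mask & 0x04) && key != e->key)
			continue;
		if ((e->mask & 0x02) && asc != e->asc)
			continue;
		if ((e->mask & 0x01) && ascq != e->ascq)
			continue;
		return e->action;
	}
	return stat ? ACT_ERROR : ACT_GOOD;	/* default policy */
}

int main(void)
{
	printf("%d\n", lookup(0x70, 0x02, 0x0b, 0x44, 0x00));	/* ACT_REQUEUE */
	return 0;
}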
2383 | |||
2384 | static void skd_resolve_req_exception(struct skd_device *skdev, | ||
2385 | struct skd_request_context *skreq) | ||
2386 | { | ||
2387 | u8 cmp_status = skreq->completion.status; | ||
2388 | |||
2389 | switch (skd_check_status(skdev, cmp_status, &skreq->err_info)) { | ||
2390 | case SKD_CHECK_STATUS_REPORT_GOOD: | ||
2391 | case SKD_CHECK_STATUS_REPORT_SMART_ALERT: | ||
2392 | skd_end_request(skdev, skreq, 0); | ||
2393 | break; | ||
2394 | |||
2395 | case SKD_CHECK_STATUS_BUSY_IMMINENT: | ||
2396 | skd_log_skreq(skdev, skreq, "retry(busy)"); | ||
2397 | blk_requeue_request(skdev->queue, skreq->req); | ||
2398 | pr_info("(%s) drive BUSY imminent\n", skd_name(skdev)); | ||
2399 | skdev->state = SKD_DRVR_STATE_BUSY_IMMINENT; | ||
2400 | skdev->timer_countdown = SKD_TIMER_MINUTES(20); | ||
2401 | skd_quiesce_dev(skdev); | ||
2402 | break; | ||
2403 | |||
2404 | case SKD_CHECK_STATUS_REQUEUE_REQUEST: | ||
2405 | if ((unsigned long) ++skreq->req->special < SKD_MAX_RETRIES) { | ||
2406 | skd_log_skreq(skdev, skreq, "retry"); | ||
2407 | blk_requeue_request(skdev->queue, skreq->req); | ||
2408 | break; | ||
2409 | } | ||
2410 | /* fall through to report error */ | ||
2411 | |||
2412 | case SKD_CHECK_STATUS_REPORT_ERROR: | ||
2413 | default: | ||
2414 | skd_end_request(skdev, skreq, -EIO); | ||
2415 | break; | ||
2416 | } | ||
2417 | } | ||
2418 | |||
2419 | /* assume spinlock is already held */ | ||
2420 | static void skd_release_skreq(struct skd_device *skdev, | ||
2421 | struct skd_request_context *skreq) | ||
2422 | { | ||
2423 | u32 msg_slot; | ||
2424 | struct skd_fitmsg_context *skmsg; | ||
2425 | |||
2426 | u32 timo_slot; | ||
2427 | |||
2428 | /* | ||
2429 | * Reclaim the FIT msg buffer if this is | ||
2430 | * the first of the requests it carried to | ||
2431 | * be completed. The FIT msg buffer used to | ||
2432 | * send this request cannot be reused until | ||
2433 | * we are sure the s1120 card has copied | ||
2434 | * it to its memory. The FIT msg might have | ||
2435 | * contained several requests. As soon as | ||
2436 | * any of them are completed we know that | ||
2437 | * the entire FIT msg was transferred. | ||
2438 | * Only the first completed request will | ||
2439 | * match the FIT msg buffer id. The FIT | ||
2440 | * msg buffer id is immediately updated. | ||
2441 | * When subsequent requests complete the FIT | ||
2442 | * msg buffer id won't match, so we know | ||
2443 | * quite cheaply that it is already done. | ||
2444 | */ | ||
2445 | msg_slot = skreq->fitmsg_id & SKD_ID_SLOT_MASK; | ||
2446 | SKD_ASSERT(msg_slot < skdev->num_fitmsg_context); | ||
2447 | |||
2448 | skmsg = &skdev->skmsg_table[msg_slot]; | ||
2449 | if (skmsg->id == skreq->fitmsg_id) { | ||
2450 | SKD_ASSERT(skmsg->state == SKD_MSG_STATE_BUSY); | ||
2451 | SKD_ASSERT(skmsg->outstanding > 0); | ||
2452 | skmsg->outstanding--; | ||
2453 | if (skmsg->outstanding == 0) { | ||
2454 | skmsg->state = SKD_MSG_STATE_IDLE; | ||
2455 | skmsg->id += SKD_ID_INCR; | ||
2456 | skmsg->next = skdev->skmsg_free_list; | ||
2457 | skdev->skmsg_free_list = skmsg; | ||
2458 | } | ||
2459 | } | ||
2460 | |||
2461 | /* | ||
2462 | * Decrease the number of active requests. | ||
2463 | * Also decrements the count in the timeout slot. | ||
2464 | */ | ||
2465 | SKD_ASSERT(skdev->in_flight > 0); | ||
2466 | skdev->in_flight -= 1; | ||
2467 | |||
2468 | timo_slot = skreq->timeout_stamp & SKD_TIMEOUT_SLOT_MASK; | ||
2469 | SKD_ASSERT(skdev->timeout_slot[timo_slot] > 0); | ||
2470 | skdev->timeout_slot[timo_slot] -= 1; | ||
2471 | |||
2472 | /* | ||
2473 | * Reset backpointer | ||
2474 | */ | ||
2475 | skreq->req = NULL; | ||
2476 | |||
2477 | /* | ||
2478 | * Reclaim the skd_request_context | ||
2479 | */ | ||
2480 | skreq->state = SKD_REQ_STATE_IDLE; | ||
2481 | skreq->id += SKD_ID_INCR; | ||
2482 | skreq->next = skdev->skreq_free_list; | ||
2483 | skdev->skreq_free_list = skreq; | ||
2484 | } | ||
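A request ID combines a slot index in its low bits with a generation counter that skd_release_skreq() bumps by SKD_ID_INCR, so a completion that arrives after the slot was recycled simply fails the ID comparison. A minimal sketch of that stale-completion guard (field widths are illustrative):

#include <stdint.h>
#include <stdio.h>

#define SLOT_MASK  0x00ffu	/* low bits: slot index (illustrative) */
#define GEN_INCR   0x0100u	/* adding this bumps the generation    */

struct slot {
	uint16_t id;		/* slot index | generation             */
	int busy;
};

static void release_slot(struct slot *s)
{
	s->busy = 0;
	s->id += GEN_INCR;	/* stale completions now mismatch */
}

static int completion_matches(const struct slot *s, uint16_t comp_id)
{
	return s->id == comp_id;
}

int main(void)
{
	struct slot s = { .id = 0x0005, .busy = 1 };
	uint16_t issued_id = s.id;

	release_slot(&s);	/* e.g. the request timed out and was reclaimed */
	printf("stale completion accepted? %d\n",
	       completion_matches(&s, issued_id));	/* prints 0 */
	return 0;
}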
2485 | |||
2486 | #define DRIVER_INQ_EVPD_PAGE_CODE 0xDA | ||
2487 | |||
2488 | static void skd_do_inq_page_00(struct skd_device *skdev, | ||
2489 | volatile struct fit_completion_entry_v1 *skcomp, | ||
2490 | volatile struct fit_comp_error_info *skerr, | ||
2491 | uint8_t *cdb, uint8_t *buf) | ||
2492 | { | ||
2493 | uint16_t insert_pt, max_bytes, drive_pages, drive_bytes, new_size; | ||
2494 | |||
2495 | /* Caller requested "supported pages". The driver needs to insert | ||
2496 | * its page. | ||
2497 | */ | ||
2498 | pr_debug("%s:%s:%d skd_do_driver_inquiry: modify supported pages.\n", | ||
2499 | skdev->name, __func__, __LINE__); | ||
2500 | |||
2501 | /* If the device rejected the request because the CDB was | ||
2502 | * improperly formed, then just leave. | ||
2503 | */ | ||
2504 | if (skcomp->status == SAM_STAT_CHECK_CONDITION && | ||
2505 | skerr->key == ILLEGAL_REQUEST && skerr->code == 0x24) | ||
2506 | return; | ||
2507 | |||
2508 | /* Get the amount of space the caller allocated */ | ||
2509 | max_bytes = (cdb[3] << 8) | cdb[4]; | ||
2510 | |||
2511 | /* Get the number of pages actually returned by the device */ | ||
2512 | drive_pages = (buf[2] << 8) | buf[3]; | ||
2513 | drive_bytes = drive_pages + 4; | ||
2514 | new_size = drive_pages + 1; | ||
2515 | |||
2516 | /* Supported pages must be in numerical order, so find where | ||
2517 | * the driver page needs to be inserted into the list of | ||
2518 | * pages returned by the device. | ||
2519 | */ | ||
2520 | for (insert_pt = 4; insert_pt < drive_bytes; insert_pt++) { | ||
2521 | if (buf[insert_pt] == DRIVER_INQ_EVPD_PAGE_CODE) | ||
2522 | return; /* Device is using this page code. Abort. */ | ||
2523 | else if (buf[insert_pt] > DRIVER_INQ_EVPD_PAGE_CODE) | ||
2524 | break; | ||
2525 | } | ||
2526 | |||
2527 | if (insert_pt < max_bytes) { | ||
2528 | uint16_t u; | ||
2529 | |||
2530 | /* Shift everything up one byte to make room. */ | ||
2531 | for (u = new_size + 3; u > insert_pt; u--) | ||
2532 | buf[u] = buf[u - 1]; | ||
2533 | buf[insert_pt] = DRIVER_INQ_EVPD_PAGE_CODE; | ||
2534 | |||
2535 | /* Increment num_returned_bytes by one, keeping SCSI (big-endian) byte order */ | ||
2536 | skcomp->num_returned_bytes = | ||
2537 | be32_to_cpu(skcomp->num_returned_bytes) + 1; | ||
2538 | skcomp->num_returned_bytes = | ||
2539 | be32_to_cpu(skcomp->num_returned_bytes); | ||
2540 | } | ||
2541 | |||
2542 | /* update page length field to reflect the driver's page too */ | ||
2543 | buf[2] = (uint8_t)((new_size >> 8) & 0xFF); | ||
2544 | buf[3] = (uint8_t)((new_size >> 0) & 0xFF); | ||
2545 | } | ||
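skd_do_inq_page_00() splices the driver's VPD page code into the device's sorted supported-pages list, shifting the tail up one byte and bumping the page count in the header. A standalone sketch of that ordered insertion (it follows the same header reading as the code above, not necessarily the full SCSI spec):

#include <stdio.h>

#define DRIVER_PAGE 0xDA

/* buf: INQUIRY page 0x00 (4-byte header + sorted page codes).
 * cap: bytes available in buf. Returns the new number of page codes. */
static int insert_page_code(unsigned char *buf, int cap)
{
	int npages = (buf[2] << 8) | buf[3];
	int end = npages + 4;		/* one byte per page code */
	int ins;

	for (ins = 4; ins < end; ins++) {
		if (buf[ins] == DRIVER_PAGE)
			return npages;	/* already present */
		if (buf[ins] > DRIVER_PAGE)
			break;
	}
	if (end + 1 > cap)
		return npages;		/* no room: leave the list unchanged */

	for (int i = end; i > ins; i--)	/* shift the tail up one byte */
		buf[i] = buf[i - 1];
	buf[ins] = DRIVER_PAGE;

	npages++;
	buf[2] = (npages >> 8) & 0xFF;
	buf[3] = npages & 0xFF;
	return npages;
}

int main(void)
{
	unsigned char buf[16] = { 0, 0, 0, 3, 0x00, 0x80, 0x83 };
	int n = insert_page_code(buf, sizeof(buf));

	printf("pages=%d list=%02x %02x %02x %02x\n",
	       n, buf[4], buf[5], buf[6], buf[7]);
	return 0;
}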
2546 | |||
2547 | static void skd_get_link_info(struct pci_dev *pdev, u8 *speed, u8 *width) | ||
2548 | { | ||
2549 | int pcie_reg; | ||
2550 | u16 pci_bus_speed; | ||
2551 | u8 pci_lanes; | ||
2552 | |||
2553 | pcie_reg = pci_find_capability(pdev, PCI_CAP_ID_EXP); | ||
2554 | if (pcie_reg) { | ||
2555 | u16 linksta; | ||
2556 | pci_read_config_word(pdev, pcie_reg + PCI_EXP_LNKSTA, &linksta); | ||
2557 | |||
2558 | pci_bus_speed = linksta & 0xF; | ||
2559 | pci_lanes = (linksta & 0x3F0) >> 4; | ||
2560 | } else { | ||
2561 | *speed = STEC_LINK_UNKNOWN; | ||
2562 | *width = 0xFF; | ||
2563 | return; | ||
2564 | } | ||
2565 | |||
2566 | switch (pci_bus_speed) { | ||
2567 | case 1: | ||
2568 | *speed = STEC_LINK_2_5GTS; | ||
2569 | break; | ||
2570 | case 2: | ||
2571 | *speed = STEC_LINK_5GTS; | ||
2572 | break; | ||
2573 | case 3: | ||
2574 | *speed = STEC_LINK_8GTS; | ||
2575 | break; | ||
2576 | default: | ||
2577 | *speed = STEC_LINK_UNKNOWN; | ||
2578 | break; | ||
2579 | } | ||
2580 | |||
2581 | if (pci_lanes <= 0x20) | ||
2582 | *width = pci_lanes; | ||
2583 | else | ||
2584 | *width = 0xFF; | ||
2585 | } | ||
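skd_get_link_info() reads the PCIe Link Status register, where the current link speed code occupies bits 3:0 and the negotiated width bits 9:4. A tiny decoder for a raw register value, using the same speed mapping as the switch above:

#include <stdint.h>
#include <stdio.h>

static const char *speed_name(unsigned int code)
{
	switch (code) {
	case 1: return "2.5 GT/s";
	case 2: return "5 GT/s";
	case 3: return "8 GT/s";
	default: return "unknown";
	}
}

int main(void)
{
	uint16_t lnksta = 0x0042;	/* example raw value: x4 at 5 GT/s */
	unsigned int speed = lnksta & 0xF;
	unsigned int width = (lnksta >> 4) & 0x3F;

	printf("link: %s x%u\n", speed_name(speed), width);
	return 0;
}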
2586 | |||
2587 | static void skd_do_inq_page_da(struct skd_device *skdev, | ||
2588 | volatile struct fit_completion_entry_v1 *skcomp, | ||
2589 | volatile struct fit_comp_error_info *skerr, | ||
2590 | uint8_t *cdb, uint8_t *buf) | ||
2591 | { | ||
2592 | struct pci_dev *pdev = skdev->pdev; | ||
2593 | unsigned max_bytes; | ||
2594 | struct driver_inquiry_data inq; | ||
2595 | u16 val; | ||
2596 | |||
2597 | pr_debug("%s:%s:%d skd_do_driver_inquiry: return driver page\n", | ||
2598 | skdev->name, __func__, __LINE__); | ||
2599 | |||
2600 | memset(&inq, 0, sizeof(inq)); | ||
2601 | |||
2602 | inq.page_code = DRIVER_INQ_EVPD_PAGE_CODE; | ||
2603 | |||
2604 | skd_get_link_info(pdev, &inq.pcie_link_speed, &inq.pcie_link_lanes); | ||
2605 | inq.pcie_bus_number = cpu_to_be16(pdev->bus->number); | ||
2606 | inq.pcie_device_number = PCI_SLOT(pdev->devfn); | ||
2607 | inq.pcie_function_number = PCI_FUNC(pdev->devfn); | ||
2608 | |||
2609 | pci_read_config_word(pdev, PCI_VENDOR_ID, &val); | ||
2610 | inq.pcie_vendor_id = cpu_to_be16(val); | ||
2611 | |||
2612 | pci_read_config_word(pdev, PCI_DEVICE_ID, &val); | ||
2613 | inq.pcie_device_id = cpu_to_be16(val); | ||
2614 | |||
2615 | pci_read_config_word(pdev, PCI_SUBSYSTEM_VENDOR_ID, &val); | ||
2616 | inq.pcie_subsystem_vendor_id = cpu_to_be16(val); | ||
2617 | |||
2618 | pci_read_config_word(pdev, PCI_SUBSYSTEM_ID, &val); | ||
2619 | inq.pcie_subsystem_device_id = cpu_to_be16(val); | ||
2620 | |||
2621 | /* Driver version, fixed length, padded with spaces on the right */ | ||
2622 | inq.driver_version_length = sizeof(inq.driver_version); | ||
2623 | memset(&inq.driver_version, ' ', sizeof(inq.driver_version)); | ||
2624 | memcpy(inq.driver_version, DRV_VER_COMPL, | ||
2625 | min(sizeof(inq.driver_version), strlen(DRV_VER_COMPL))); | ||
2626 | |||
2627 | inq.page_length = cpu_to_be16((sizeof(inq) - 4)); | ||
2628 | |||
2629 | /* Clear the error set by the device */ | ||
2630 | skcomp->status = SAM_STAT_GOOD; | ||
2631 | memset((void *)skerr, 0, sizeof(*skerr)); | ||
2632 | |||
2633 | /* copy response into output buffer */ | ||
2634 | max_bytes = (cdb[3] << 8) | cdb[4]; | ||
2635 | memcpy(buf, &inq, min_t(unsigned, max_bytes, sizeof(inq))); | ||
2636 | |||
2637 | skcomp->num_returned_bytes = | ||
2638 | be32_to_cpu(min_t(uint16_t, max_bytes, sizeof(inq))); | ||
2639 | } | ||
2640 | |||
2641 | static void skd_do_driver_inq(struct skd_device *skdev, | ||
2642 | volatile struct fit_completion_entry_v1 *skcomp, | ||
2643 | volatile struct fit_comp_error_info *skerr, | ||
2644 | uint8_t *cdb, uint8_t *buf) | ||
2645 | { | ||
2646 | if (!buf) | ||
2647 | return; | ||
2648 | else if (cdb[0] != INQUIRY) | ||
2649 | return; /* Not an INQUIRY */ | ||
2650 | else if ((cdb[1] & 1) == 0) | ||
2651 | return; /* EVPD not set */ | ||
2652 | else if (cdb[2] == 0) | ||
2653 | /* Need to add driver's page to supported pages list */ | ||
2654 | skd_do_inq_page_00(skdev, skcomp, skerr, cdb, buf); | ||
2655 | else if (cdb[2] == DRIVER_INQ_EVPD_PAGE_CODE) | ||
2656 | /* Caller requested driver's page */ | ||
2657 | skd_do_inq_page_da(skdev, skcomp, skerr, cdb, buf); | ||
2658 | } | ||
2659 | |||
2660 | static unsigned char *skd_sg_1st_page_ptr(struct scatterlist *sg) | ||
2661 | { | ||
2662 | if (!sg) | ||
2663 | return NULL; | ||
2664 | if (!sg_page(sg)) | ||
2665 | return NULL; | ||
2666 | return sg_virt(sg); | ||
2667 | } | ||
2668 | |||
2669 | static void skd_process_scsi_inq(struct skd_device *skdev, | ||
2670 | volatile struct fit_completion_entry_v1 | ||
2671 | *skcomp, | ||
2672 | volatile struct fit_comp_error_info *skerr, | ||
2673 | struct skd_special_context *skspcl) | ||
2674 | { | ||
2675 | uint8_t *buf; | ||
2676 | struct fit_msg_hdr *fmh = (struct fit_msg_hdr *)skspcl->msg_buf; | ||
2677 | struct skd_scsi_request *scsi_req = (struct skd_scsi_request *)&fmh[1]; | ||
2678 | |||
2679 | dma_sync_sg_for_cpu(skdev->class_dev, skspcl->req.sg, skspcl->req.n_sg, | ||
2680 | skspcl->req.sg_data_dir); | ||
2681 | buf = skd_sg_1st_page_ptr(skspcl->req.sg); | ||
2682 | |||
2683 | if (buf) | ||
2684 | skd_do_driver_inq(skdev, skcomp, skerr, scsi_req->cdb, buf); | ||
2685 | } | ||
2686 | |||
2687 | |||
2688 | static int skd_isr_completion_posted(struct skd_device *skdev, | ||
2689 | int limit, int *enqueued) | ||
2690 | { | ||
2691 | volatile struct fit_completion_entry_v1 *skcmp = NULL; | ||
2692 | volatile struct fit_comp_error_info *skerr; | ||
2693 | u16 req_id; | ||
2694 | u32 req_slot; | ||
2695 | struct skd_request_context *skreq; | ||
2696 | u16 cmp_cntxt = 0; | ||
2697 | u8 cmp_status = 0; | ||
2698 | u8 cmp_cycle = 0; | ||
2699 | u32 cmp_bytes = 0; | ||
2700 | int rc = 0; | ||
2701 | int processed = 0; | ||
2702 | |||
2703 | for (;; ) { | ||
2704 | SKD_ASSERT(skdev->skcomp_ix < SKD_N_COMPLETION_ENTRY); | ||
2705 | |||
2706 | skcmp = &skdev->skcomp_table[skdev->skcomp_ix]; | ||
2707 | cmp_cycle = skcmp->cycle; | ||
2708 | cmp_cntxt = skcmp->tag; | ||
2709 | cmp_status = skcmp->status; | ||
2710 | cmp_bytes = be32_to_cpu(skcmp->num_returned_bytes); | ||
2711 | |||
2712 | skerr = &skdev->skerr_table[skdev->skcomp_ix]; | ||
2713 | |||
2714 | pr_debug("%s:%s:%d " | ||
2715 | "cycle=%d ix=%d got cycle=%d cmdctxt=0x%x stat=%d " | ||
2716 | "busy=%d rbytes=0x%x proto=%d\n", | ||
2717 | skdev->name, __func__, __LINE__, skdev->skcomp_cycle, | ||
2718 | skdev->skcomp_ix, cmp_cycle, cmp_cntxt, cmp_status, | ||
2719 | skdev->in_flight, cmp_bytes, skdev->proto_ver); | ||
2720 | |||
2721 | if (cmp_cycle != skdev->skcomp_cycle) { | ||
2722 | pr_debug("%s:%s:%d end of completions\n", | ||
2723 | skdev->name, __func__, __LINE__); | ||
2724 | break; | ||
2725 | } | ||
2726 | /* | ||
2727 | * Update the completion queue head index and possibly | ||
2728 | * the completion cycle count. 8-bit wrap-around. | ||
2729 | */ | ||
2730 | skdev->skcomp_ix++; | ||
2731 | if (skdev->skcomp_ix >= SKD_N_COMPLETION_ENTRY) { | ||
2732 | skdev->skcomp_ix = 0; | ||
2733 | skdev->skcomp_cycle++; | ||
2734 | } | ||
2735 | |||
2736 | /* | ||
2737 | * The command context is a unique 32-bit ID. The low order | ||
2738 | * bits help locate the request. The request is usually a | ||
2739 | * r/w request (see skd_start() above) or a special request. | ||
2740 | */ | ||
2741 | req_id = cmp_cntxt; | ||
2742 | req_slot = req_id & SKD_ID_SLOT_AND_TABLE_MASK; | ||
2743 | |||
2744 | /* Is this other than a r/w request? */ | ||
2745 | if (req_slot >= skdev->num_req_context) { | ||
2746 | /* | ||
2747 | * This is not a completion for a r/w request. | ||
2748 | */ | ||
2749 | skd_complete_other(skdev, skcmp, skerr); | ||
2750 | continue; | ||
2751 | } | ||
2752 | |||
2753 | skreq = &skdev->skreq_table[req_slot]; | ||
2754 | |||
2755 | /* | ||
2756 | * Make sure the request ID for the slot matches. | ||
2757 | */ | ||
2758 | if (skreq->id != req_id) { | ||
2759 | pr_debug("%s:%s:%d mismatch comp_id=0x%x req_id=0x%x\n", | ||
2760 | skdev->name, __func__, __LINE__, | ||
2761 | req_id, skreq->id); | ||
2762 | { | ||
2763 | u16 new_id = cmp_cntxt; | ||
2764 | pr_err("(%s): Completion mismatch " | ||
2765 | "comp_id=0x%04x skreq=0x%04x new=0x%04x\n", | ||
2766 | skd_name(skdev), req_id, | ||
2767 | skreq->id, new_id); | ||
2768 | |||
2769 | continue; | ||
2770 | } | ||
2771 | } | ||
2772 | |||
2773 | SKD_ASSERT(skreq->state == SKD_REQ_STATE_BUSY); | ||
2774 | |||
2775 | if (skreq->state == SKD_REQ_STATE_ABORTED) { | ||
2776 | pr_debug("%s:%s:%d reclaim req %p id=%04x\n", | ||
2777 | skdev->name, __func__, __LINE__, | ||
2778 | skreq, skreq->id); | ||
2779 | /* a previously timed out command can | ||
2780 | * now be cleaned up */ | ||
2781 | skd_release_skreq(skdev, skreq); | ||
2782 | continue; | ||
2783 | } | ||
2784 | |||
2785 | skreq->completion = *skcmp; | ||
2786 | if (unlikely(cmp_status == SAM_STAT_CHECK_CONDITION)) { | ||
2787 | skreq->err_info = *skerr; | ||
2788 | skd_log_check_status(skdev, cmp_status, skerr->key, | ||
2789 | skerr->code, skerr->qual, | ||
2790 | skerr->fruc); | ||
2791 | } | ||
2792 | /* Release DMA resources for the request. */ | ||
2793 | if (skreq->n_sg > 0) | ||
2794 | skd_postop_sg_list(skdev, skreq); | ||
2795 | |||
2796 | if (!skreq->req) { | ||
2797 | pr_debug("%s:%s:%d NULL backptr skdreq %p, " | ||
2798 | "req=0x%x req_id=0x%x\n", | ||
2799 | skdev->name, __func__, __LINE__, | ||
2800 | skreq, skreq->id, req_id); | ||
2801 | } else { | ||
2802 | /* | ||
2803 | * Capture the outcome and post it back to the | ||
2804 | * native request. | ||
2805 | */ | ||
2806 | if (likely(cmp_status == SAM_STAT_GOOD)) | ||
2807 | skd_end_request(skdev, skreq, 0); | ||
2808 | else | ||
2809 | skd_resolve_req_exception(skdev, skreq); | ||
2810 | } | ||
2811 | |||
2812 | /* | ||
2813 | * Release the skreq, its FIT msg (if one), timeout slot, | ||
2814 | * and queue depth. | ||
2815 | */ | ||
2816 | skd_release_skreq(skdev, skreq); | ||
2817 | |||
3818 | /* skd_isr_comp_limit equal to zero means no limit */ | ||
2819 | if (limit) { | ||
2820 | if (++processed >= limit) { | ||
2821 | rc = 1; | ||
2822 | break; | ||
2823 | } | ||
2824 | } | ||
2825 | } | ||
2826 | |||
2827 | if ((skdev->state == SKD_DRVR_STATE_PAUSING) | ||
2828 | && (skdev->in_flight) == 0) { | ||
2829 | skdev->state = SKD_DRVR_STATE_PAUSED; | ||
2830 | wake_up_interruptible(&skdev->waitq); | ||
2831 | } | ||
2832 | |||
2833 | return rc; | ||
2834 | } | ||
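The loop above never consults a producer index; entry validity comes entirely from the cycle field. A minimal sketch of that convention, using only fields already referenced above (illustration, not driver code):

    /* An entry at skcomp_ix is consumable while its cycle byte matches the
     * cycle the driver expects; skcomp_cycle advances each time skcomp_ix
     * wraps, so leftovers from the previous pass around the ring compare
     * unequal and terminate the loop.
     */
    static bool skd_comp_entry_ready(struct skd_device *skdev)
    {
            volatile struct fit_completion_entry_v1 *e =
                    &skdev->skcomp_table[skdev->skcomp_ix];

            return e->cycle == skdev->skcomp_cycle;
    }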
2835 | |||
2836 | static void skd_complete_other(struct skd_device *skdev, | ||
2837 | volatile struct fit_completion_entry_v1 *skcomp, | ||
2838 | volatile struct fit_comp_error_info *skerr) | ||
2839 | { | ||
2840 | u32 req_id = 0; | ||
2841 | u32 req_table; | ||
2842 | u32 req_slot; | ||
2843 | struct skd_special_context *skspcl; | ||
2844 | |||
2845 | req_id = skcomp->tag; | ||
2846 | req_table = req_id & SKD_ID_TABLE_MASK; | ||
2847 | req_slot = req_id & SKD_ID_SLOT_MASK; | ||
2848 | |||
2849 | pr_debug("%s:%s:%d table=0x%x id=0x%x slot=%d\n", | ||
2850 | skdev->name, __func__, __LINE__, | ||
2851 | req_table, req_id, req_slot); | ||
2852 | |||
2853 | /* | ||
2854 | * Based on the request id, determine how to dispatch this completion. | ||
2855 | * This switch/case finds the good cases and forwards the | ||
2856 | * completion entry. Errors are reported below the switch. | ||
2857 | */ | ||
2858 | switch (req_table) { | ||
2859 | case SKD_ID_RW_REQUEST: | ||
2860 | /* | ||
2861 | * The caller, skd_completion_posted_isr() above, | ||
2862 | * handles r/w requests. The only way we get here | ||
2863 | * is if the req_slot is out of bounds. | ||
2864 | */ | ||
2865 | break; | ||
2866 | |||
2867 | case SKD_ID_SPECIAL_REQUEST: | ||
2868 | /* | ||
2869 | * Make sure the req_slot is in bounds and that the id | ||
2870 | * matches. | ||
2871 | */ | ||
2872 | if (req_slot < skdev->n_special) { | ||
2873 | skspcl = &skdev->skspcl_table[req_slot]; | ||
2874 | if (skspcl->req.id == req_id && | ||
2875 | skspcl->req.state == SKD_REQ_STATE_BUSY) { | ||
2876 | skd_complete_special(skdev, | ||
2877 | skcomp, skerr, skspcl); | ||
2878 | return; | ||
2879 | } | ||
2880 | } | ||
2881 | break; | ||
2882 | |||
2883 | case SKD_ID_INTERNAL: | ||
2884 | if (req_slot == 0) { | ||
2885 | skspcl = &skdev->internal_skspcl; | ||
2886 | if (skspcl->req.id == req_id && | ||
2887 | skspcl->req.state == SKD_REQ_STATE_BUSY) { | ||
2888 | skd_complete_internal(skdev, | ||
2889 | skcomp, skerr, skspcl); | ||
2890 | return; | ||
2891 | } | ||
2892 | } | ||
2893 | break; | ||
2894 | |||
2895 | case SKD_ID_FIT_MSG: | ||
2896 | /* | ||
2897 | * These ids should never appear in a completion record. | ||
2898 | */ | ||
2899 | break; | ||
2900 | |||
2901 | default: | ||
2902 | /* | ||
2903 | * These ids should never appear anywhere. | ||
2904 | */ | ||
2905 | break; | ||
2906 | } | ||
2907 | |||
2908 | /* | ||
2909 | * If we get here it is a bad or stale id. | ||
2910 | */ | ||
2911 | } | ||
2912 | |||
2913 | static void skd_complete_special(struct skd_device *skdev, | ||
2914 | volatile struct fit_completion_entry_v1 | ||
2915 | *skcomp, | ||
2916 | volatile struct fit_comp_error_info *skerr, | ||
2917 | struct skd_special_context *skspcl) | ||
2918 | { | ||
2919 | pr_debug("%s:%s:%d completing special request %p\n", | ||
2920 | skdev->name, __func__, __LINE__, skspcl); | ||
2921 | if (skspcl->orphaned) { | ||
2922 | /* Discard orphaned request */ | ||
2923 | /* ?: Can this release directly or does it need | ||
2924 | * to use a worker? */ | ||
2925 | pr_debug("%s:%s:%d release orphaned %p\n", | ||
2926 | skdev->name, __func__, __LINE__, skspcl); | ||
2927 | skd_release_special(skdev, skspcl); | ||
2928 | return; | ||
2929 | } | ||
2930 | |||
2931 | skd_process_scsi_inq(skdev, skcomp, skerr, skspcl); | ||
2932 | |||
2933 | skspcl->req.state = SKD_REQ_STATE_COMPLETED; | ||
2934 | skspcl->req.completion = *skcomp; | ||
2935 | skspcl->req.err_info = *skerr; | ||
2936 | |||
2937 | skd_log_check_status(skdev, skspcl->req.completion.status, skerr->key, | ||
2938 | skerr->code, skerr->qual, skerr->fruc); | ||
2939 | |||
2940 | wake_up_interruptible(&skdev->waitq); | ||
2941 | } | ||
2942 | |||
2943 | /* assume spinlock is already held */ | ||
2944 | static void skd_release_special(struct skd_device *skdev, | ||
2945 | struct skd_special_context *skspcl) | ||
2946 | { | ||
2947 | int i, was_depleted; | ||
2948 | |||
2949 | for (i = 0; i < skspcl->req.n_sg; i++) { | ||
2950 | struct page *page = sg_page(&skspcl->req.sg[i]); | ||
2951 | __free_page(page); | ||
2952 | } | ||
2953 | |||
2954 | was_depleted = (skdev->skspcl_free_list == NULL); | ||
2955 | |||
2956 | skspcl->req.state = SKD_REQ_STATE_IDLE; | ||
2957 | skspcl->req.id += SKD_ID_INCR; | ||
2958 | skspcl->req.next = | ||
2959 | (struct skd_request_context *)skdev->skspcl_free_list; | ||
2960 | skdev->skspcl_free_list = (struct skd_special_context *)skspcl; | ||
2961 | |||
2962 | if (was_depleted) { | ||
2963 | pr_debug("%s:%s:%d skspcl was depleted\n", | ||
2964 | skdev->name, __func__, __LINE__); | ||
2965 | /* Free list was depleted. There might be waiters. */ | ||
2966 | wake_up_interruptible(&skdev->waitq); | ||
2967 | } | ||
2968 | } | ||
2969 | |||
2970 | static void skd_reset_skcomp(struct skd_device *skdev) | ||
2971 | { | ||
2972 | u32 nbytes; | ||
2973 | struct fit_completion_entry_v1 *skcomp; | ||
2974 | |||
2975 | nbytes = sizeof(*skcomp) * SKD_N_COMPLETION_ENTRY; | ||
2976 | nbytes += sizeof(struct fit_comp_error_info) * SKD_N_COMPLETION_ENTRY; | ||
2977 | |||
2978 | memset(skdev->skcomp_table, 0, nbytes); | ||
2979 | |||
2980 | skdev->skcomp_ix = 0; | ||
2981 | skdev->skcomp_cycle = 1; | ||
2982 | } | ||
2983 | |||
2984 | /* | ||
2985 | ***************************************************************************** | ||
2986 | * INTERRUPTS | ||
2987 | ***************************************************************************** | ||
2988 | */ | ||
2989 | static void skd_completion_worker(struct work_struct *work) | ||
2990 | { | ||
2991 | struct skd_device *skdev = | ||
2992 | container_of(work, struct skd_device, completion_worker); | ||
2993 | unsigned long flags; | ||
2994 | int flush_enqueued = 0; | ||
2995 | |||
2996 | spin_lock_irqsave(&skdev->lock, flags); | ||
2997 | |||
2998 | /* | ||
2999 | * Pass in limit=0, which means no limit: | ||
3000 | * process everything in the completion queue. | ||
3001 | */ | ||
3002 | skd_isr_completion_posted(skdev, 0, &flush_enqueued); | ||
3003 | skd_request_fn(skdev->queue); | ||
3004 | |||
3005 | spin_unlock_irqrestore(&skdev->lock, flags); | ||
3006 | } | ||
3007 | |||
3008 | static void skd_isr_msg_from_dev(struct skd_device *skdev); | ||
3009 | |||
3010 | static irqreturn_t | ||
3011 | skd_isr(int irq, void *ptr) | ||
3012 | { | ||
3013 | struct skd_device *skdev; | ||
3014 | u32 intstat; | ||
3015 | u32 ack; | ||
3016 | int rc = 0; | ||
3017 | int deferred = 0; | ||
3018 | int flush_enqueued = 0; | ||
3019 | |||
3020 | skdev = (struct skd_device *)ptr; | ||
3021 | spin_lock(&skdev->lock); | ||
3022 | |||
3023 | for (;;) { | ||
3024 | intstat = SKD_READL(skdev, FIT_INT_STATUS_HOST); | ||
3025 | |||
3026 | ack = FIT_INT_DEF_MASK; | ||
3027 | ack &= intstat; | ||
3028 | |||
3029 | pr_debug("%s:%s:%d intstat=0x%x ack=0x%x\n", | ||
3030 | skdev->name, __func__, __LINE__, intstat, ack); | ||
3031 | |||
3032 | /* As long as there is an interrupt pending on the device, keep | ||
3033 | * running the loop. When none remain, get out; but if we've never | ||
3034 | * done any processing, call the completion handler? | ||
3035 | */ | ||
3036 | if (ack == 0) { | ||
3037 | /* No interrupts on device, but run the completion | ||
3038 | * processor anyway? | ||
3039 | */ | ||
3040 | if (rc == 0) | ||
3041 | if (likely(skdev->state | ||
3042 | == SKD_DRVR_STATE_ONLINE)) | ||
3043 | deferred = 1; | ||
3044 | break; | ||
3045 | } | ||
3046 | |||
3047 | rc = IRQ_HANDLED; | ||
3048 | |||
3049 | SKD_WRITEL(skdev, ack, FIT_INT_STATUS_HOST); | ||
3050 | |||
3051 | if (likely((skdev->state != SKD_DRVR_STATE_LOAD) && | ||
3052 | (skdev->state != SKD_DRVR_STATE_STOPPING))) { | ||
3053 | if (intstat & FIT_ISH_COMPLETION_POSTED) { | ||
3054 | /* | ||
3055 | * If we have already deferred completion | ||
3056 | * processing, don't bother running it again | ||
3057 | */ | ||
3058 | if (deferred == 0) | ||
3059 | deferred = | ||
3060 | skd_isr_completion_posted(skdev, | ||
3061 | skd_isr_comp_limit, &flush_enqueued); | ||
3062 | } | ||
3063 | |||
3064 | if (intstat & FIT_ISH_FW_STATE_CHANGE) { | ||
3065 | skd_isr_fwstate(skdev); | ||
3066 | if (skdev->state == SKD_DRVR_STATE_FAULT || | ||
3067 | skdev->state == | ||
3068 | SKD_DRVR_STATE_DISAPPEARED) { | ||
3069 | spin_unlock(&skdev->lock); | ||
3070 | return rc; | ||
3071 | } | ||
3072 | } | ||
3073 | |||
3074 | if (intstat & FIT_ISH_MSG_FROM_DEV) | ||
3075 | skd_isr_msg_from_dev(skdev); | ||
3076 | } | ||
3077 | } | ||
3078 | |||
3079 | if (unlikely(flush_enqueued)) | ||
3080 | skd_request_fn(skdev->queue); | ||
3081 | |||
3082 | if (deferred) | ||
3083 | schedule_work(&skdev->completion_worker); | ||
3084 | else if (!flush_enqueued) | ||
3085 | skd_request_fn(skdev->queue); | ||
3086 | |||
3087 | spin_unlock(&skdev->lock); | ||
3088 | |||
3089 | return rc; | ||
3090 | } | ||
3091 | |||
3092 | static void skd_drive_fault(struct skd_device *skdev) | ||
3093 | { | ||
3094 | skdev->state = SKD_DRVR_STATE_FAULT; | ||
3095 | pr_err("(%s): Drive FAULT\n", skd_name(skdev)); | ||
3096 | } | ||
3097 | |||
3098 | static void skd_drive_disappeared(struct skd_device *skdev) | ||
3099 | { | ||
3100 | skdev->state = SKD_DRVR_STATE_DISAPPEARED; | ||
3101 | pr_err("(%s): Drive DISAPPEARED\n", skd_name(skdev)); | ||
3102 | } | ||
3103 | |||
3104 | static void skd_isr_fwstate(struct skd_device *skdev) | ||
3105 | { | ||
3106 | u32 sense; | ||
3107 | u32 state; | ||
3108 | u32 mtd; | ||
3109 | int prev_driver_state = skdev->state; | ||
3110 | |||
3111 | sense = SKD_READL(skdev, FIT_STATUS); | ||
3112 | state = sense & FIT_SR_DRIVE_STATE_MASK; | ||
3113 | |||
3114 | pr_err("(%s): s1120 state %s(%d)=>%s(%d)\n", | ||
3115 | skd_name(skdev), | ||
3116 | skd_drive_state_to_str(skdev->drive_state), skdev->drive_state, | ||
3117 | skd_drive_state_to_str(state), state); | ||
3118 | |||
3119 | skdev->drive_state = state; | ||
3120 | |||
3121 | switch (skdev->drive_state) { | ||
3122 | case FIT_SR_DRIVE_INIT: | ||
3123 | if (skdev->state == SKD_DRVR_STATE_PROTOCOL_MISMATCH) { | ||
3124 | skd_disable_interrupts(skdev); | ||
3125 | break; | ||
3126 | } | ||
3127 | if (skdev->state == SKD_DRVR_STATE_RESTARTING) | ||
3128 | skd_recover_requests(skdev, 0); | ||
3129 | if (skdev->state == SKD_DRVR_STATE_WAIT_BOOT) { | ||
3130 | skdev->timer_countdown = SKD_STARTING_TIMO; | ||
3131 | skdev->state = SKD_DRVR_STATE_STARTING; | ||
3132 | skd_soft_reset(skdev); | ||
3133 | break; | ||
3134 | } | ||
3135 | mtd = FIT_MXD_CONS(FIT_MTD_FITFW_INIT, 0, 0); | ||
3136 | SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); | ||
3137 | skdev->last_mtd = mtd; | ||
3138 | break; | ||
3139 | |||
3140 | case FIT_SR_DRIVE_ONLINE: | ||
3141 | skdev->cur_max_queue_depth = skd_max_queue_depth; | ||
3142 | if (skdev->cur_max_queue_depth > skdev->dev_max_queue_depth) | ||
3143 | skdev->cur_max_queue_depth = skdev->dev_max_queue_depth; | ||
3144 | |||
3145 | skdev->queue_low_water_mark = | ||
3146 | skdev->cur_max_queue_depth * 2 / 3 + 1; | ||
3147 | if (skdev->queue_low_water_mark < 1) | ||
3148 | skdev->queue_low_water_mark = 1; | ||
3149 | pr_info( | ||
3150 | "(%s): Queue depth limit=%d dev=%d lowat=%d\n", | ||
3151 | skd_name(skdev), | ||
3152 | skdev->cur_max_queue_depth, | ||
3153 | skdev->dev_max_queue_depth, skdev->queue_low_water_mark); | ||
3154 | |||
3155 | skd_refresh_device_data(skdev); | ||
3156 | break; | ||
3157 | |||
3158 | case FIT_SR_DRIVE_BUSY: | ||
3159 | skdev->state = SKD_DRVR_STATE_BUSY; | ||
3160 | skdev->timer_countdown = SKD_BUSY_TIMO; | ||
3161 | skd_quiesce_dev(skdev); | ||
3162 | break; | ||
3163 | case FIT_SR_DRIVE_BUSY_SANITIZE: | ||
3164 | /* set timer for 3 seconds, we'll abort any unfinished | ||
3165 | * commands after that expires | ||
3166 | */ | ||
3167 | skdev->state = SKD_DRVR_STATE_BUSY_SANITIZE; | ||
3168 | skdev->timer_countdown = SKD_TIMER_SECONDS(3); | ||
3169 | blk_start_queue(skdev->queue); | ||
3170 | break; | ||
3171 | case FIT_SR_DRIVE_BUSY_ERASE: | ||
3172 | skdev->state = SKD_DRVR_STATE_BUSY_ERASE; | ||
3173 | skdev->timer_countdown = SKD_BUSY_TIMO; | ||
3174 | break; | ||
3175 | case FIT_SR_DRIVE_OFFLINE: | ||
3176 | skdev->state = SKD_DRVR_STATE_IDLE; | ||
3177 | break; | ||
3178 | case FIT_SR_DRIVE_SOFT_RESET: | ||
3179 | switch (skdev->state) { | ||
3180 | case SKD_DRVR_STATE_STARTING: | ||
3181 | case SKD_DRVR_STATE_RESTARTING: | ||
3182 | /* Expected by a caller of skd_soft_reset() */ | ||
3183 | break; | ||
3184 | default: | ||
3185 | skdev->state = SKD_DRVR_STATE_RESTARTING; | ||
3186 | break; | ||
3187 | } | ||
3188 | break; | ||
3189 | case FIT_SR_DRIVE_FW_BOOTING: | ||
3190 | pr_debug("%s:%s:%d ISR FIT_SR_DRIVE_FW_BOOTING %s\n", | ||
3191 | skdev->name, __func__, __LINE__, skdev->name); | ||
3192 | skdev->state = SKD_DRVR_STATE_WAIT_BOOT; | ||
3193 | skdev->timer_countdown = SKD_WAIT_BOOT_TIMO; | ||
3194 | break; | ||
3195 | |||
3196 | case FIT_SR_DRIVE_DEGRADED: | ||
3197 | case FIT_SR_PCIE_LINK_DOWN: | ||
3198 | case FIT_SR_DRIVE_NEED_FW_DOWNLOAD: | ||
3199 | break; | ||
3200 | |||
3201 | case FIT_SR_DRIVE_FAULT: | ||
3202 | skd_drive_fault(skdev); | ||
3203 | skd_recover_requests(skdev, 0); | ||
3204 | blk_start_queue(skdev->queue); | ||
3205 | break; | ||
3206 | |||
3207 | /* PCIe bus returned all Fs? */ | ||
3208 | case 0xFF: | ||
3209 | pr_info("(%s): state=0x%x sense=0x%x\n", | ||
3210 | skd_name(skdev), state, sense); | ||
3211 | skd_drive_disappeared(skdev); | ||
3212 | skd_recover_requests(skdev, 0); | ||
3213 | blk_start_queue(skdev->queue); | ||
3214 | break; | ||
3215 | default: | ||
3216 | /* | ||
3217 | * Unknown FW state. Wait for a state we recognize. | ||
3218 | */ | ||
3219 | break; | ||
3220 | } | ||
3221 | pr_err("(%s): Driver state %s(%d)=>%s(%d)\n", | ||
3222 | skd_name(skdev), | ||
3223 | skd_skdev_state_to_str(prev_driver_state), prev_driver_state, | ||
3224 | skd_skdev_state_to_str(skdev->state), skdev->state); | ||
3225 | } | ||
3226 | |||
3227 | static void skd_recover_requests(struct skd_device *skdev, int requeue) | ||
3228 | { | ||
3229 | int i; | ||
3230 | |||
3231 | for (i = 0; i < skdev->num_req_context; i++) { | ||
3232 | struct skd_request_context *skreq = &skdev->skreq_table[i]; | ||
3233 | |||
3234 | if (skreq->state == SKD_REQ_STATE_BUSY) { | ||
3235 | skd_log_skreq(skdev, skreq, "recover"); | ||
3236 | |||
3237 | SKD_ASSERT((skreq->id & SKD_ID_INCR) != 0); | ||
3238 | SKD_ASSERT(skreq->req != NULL); | ||
3239 | |||
3240 | /* Release DMA resources for the request. */ | ||
3241 | if (skreq->n_sg > 0) | ||
3242 | skd_postop_sg_list(skdev, skreq); | ||
3243 | |||
3244 | if (requeue && | ||
3245 | (unsigned long) ++skreq->req->special < | ||
3246 | SKD_MAX_RETRIES) | ||
3247 | blk_requeue_request(skdev->queue, skreq->req); | ||
3248 | else | ||
3249 | skd_end_request(skdev, skreq, -EIO); | ||
3250 | |||
3251 | skreq->req = NULL; | ||
3252 | |||
3253 | skreq->state = SKD_REQ_STATE_IDLE; | ||
3254 | skreq->id += SKD_ID_INCR; | ||
3255 | } | ||
3256 | if (i > 0) | ||
3257 | skreq[-1].next = skreq; | ||
3258 | skreq->next = NULL; | ||
3259 | } | ||
3260 | skdev->skreq_free_list = skdev->skreq_table; | ||
3261 | |||
3262 | for (i = 0; i < skdev->num_fitmsg_context; i++) { | ||
3263 | struct skd_fitmsg_context *skmsg = &skdev->skmsg_table[i]; | ||
3264 | |||
3265 | if (skmsg->state == SKD_MSG_STATE_BUSY) { | ||
3266 | skd_log_skmsg(skdev, skmsg, "salvaged"); | ||
3267 | SKD_ASSERT((skmsg->id & SKD_ID_INCR) != 0); | ||
3268 | skmsg->state = SKD_MSG_STATE_IDLE; | ||
3269 | skmsg->id += SKD_ID_INCR; | ||
3270 | } | ||
3271 | if (i > 0) | ||
3272 | skmsg[-1].next = skmsg; | ||
3273 | skmsg->next = NULL; | ||
3274 | } | ||
3275 | skdev->skmsg_free_list = skdev->skmsg_table; | ||
3276 | |||
3277 | for (i = 0; i < skdev->n_special; i++) { | ||
3278 | struct skd_special_context *skspcl = &skdev->skspcl_table[i]; | ||
3279 | |||
3280 | /* If orphaned, reclaim it because it has already been reported | ||
3281 | * to the process as an error (it was just waiting for | ||
3282 | * a completion that didn't come, and now it never will). | ||
3283 | * If busy, change to a state that will cause it to error | ||
3284 | * out in the wait routine and let it do the normal | ||
3285 | * reporting and reclaiming. | ||
3286 | */ | ||
3287 | if (skspcl->req.state == SKD_REQ_STATE_BUSY) { | ||
3288 | if (skspcl->orphaned) { | ||
3289 | pr_debug("%s:%s:%d orphaned %p\n", | ||
3290 | skdev->name, __func__, __LINE__, | ||
3291 | skspcl); | ||
3292 | skd_release_special(skdev, skspcl); | ||
3293 | } else { | ||
3294 | pr_debug("%s:%s:%d not orphaned %p\n", | ||
3295 | skdev->name, __func__, __LINE__, | ||
3296 | skspcl); | ||
3297 | skspcl->req.state = SKD_REQ_STATE_ABORTED; | ||
3298 | } | ||
3299 | } | ||
3300 | } | ||
3301 | skdev->skspcl_free_list = skdev->skspcl_table; | ||
3302 | |||
3303 | for (i = 0; i < SKD_N_TIMEOUT_SLOT; i++) | ||
3304 | skdev->timeout_slot[i] = 0; | ||
3305 | |||
3306 | skdev->in_flight = 0; | ||
3307 | } | ||
3308 | |||
3309 | static void skd_isr_msg_from_dev(struct skd_device *skdev) | ||
3310 | { | ||
3311 | u32 mfd; | ||
3312 | u32 mtd; | ||
3313 | u32 data; | ||
3314 | |||
3315 | mfd = SKD_READL(skdev, FIT_MSG_FROM_DEVICE); | ||
3316 | |||
3317 | pr_debug("%s:%s:%d mfd=0x%x last_mtd=0x%x\n", | ||
3318 | skdev->name, __func__, __LINE__, mfd, skdev->last_mtd); | ||
3319 | |||
3320 | /* ignore any mtd that is an ack for something we didn't send */ | ||
3321 | if (FIT_MXD_TYPE(mfd) != FIT_MXD_TYPE(skdev->last_mtd)) | ||
3322 | return; | ||
3323 | |||
3324 | switch (FIT_MXD_TYPE(mfd)) { | ||
3325 | case FIT_MTD_FITFW_INIT: | ||
3326 | skdev->proto_ver = FIT_PROTOCOL_MAJOR_VER(mfd); | ||
3327 | |||
3328 | if (skdev->proto_ver != FIT_PROTOCOL_VERSION_1) { | ||
3329 | pr_err("(%s): protocol mismatch\n", | ||
3330 | skdev->name); | ||
3331 | pr_err("(%s): got=%d support=%d\n", | ||
3332 | skdev->name, skdev->proto_ver, | ||
3333 | FIT_PROTOCOL_VERSION_1); | ||
3334 | pr_err("(%s): please upgrade driver\n", | ||
3335 | skdev->name); | ||
3336 | skdev->state = SKD_DRVR_STATE_PROTOCOL_MISMATCH; | ||
3337 | skd_soft_reset(skdev); | ||
3338 | break; | ||
3339 | } | ||
3340 | mtd = FIT_MXD_CONS(FIT_MTD_GET_CMDQ_DEPTH, 0, 0); | ||
3341 | SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); | ||
3342 | skdev->last_mtd = mtd; | ||
3343 | break; | ||
3344 | |||
3345 | case FIT_MTD_GET_CMDQ_DEPTH: | ||
3346 | skdev->dev_max_queue_depth = FIT_MXD_DATA(mfd); | ||
3347 | mtd = FIT_MXD_CONS(FIT_MTD_SET_COMPQ_DEPTH, 0, | ||
3348 | SKD_N_COMPLETION_ENTRY); | ||
3349 | SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); | ||
3350 | skdev->last_mtd = mtd; | ||
3351 | break; | ||
3352 | |||
3353 | case FIT_MTD_SET_COMPQ_DEPTH: | ||
3354 | SKD_WRITEQ(skdev, skdev->cq_dma_address, FIT_MSG_TO_DEVICE_ARG); | ||
3355 | mtd = FIT_MXD_CONS(FIT_MTD_SET_COMPQ_ADDR, 0, 0); | ||
3356 | SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); | ||
3357 | skdev->last_mtd = mtd; | ||
3358 | break; | ||
3359 | |||
3360 | case FIT_MTD_SET_COMPQ_ADDR: | ||
3361 | skd_reset_skcomp(skdev); | ||
3362 | mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_HOST_ID, 0, skdev->devno); | ||
3363 | SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); | ||
3364 | skdev->last_mtd = mtd; | ||
3365 | break; | ||
3366 | |||
3367 | case FIT_MTD_CMD_LOG_HOST_ID: | ||
3368 | skdev->connect_time_stamp = get_seconds(); | ||
3369 | data = skdev->connect_time_stamp & 0xFFFF; | ||
3370 | mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_LO, 0, data); | ||
3371 | SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); | ||
3372 | skdev->last_mtd = mtd; | ||
3373 | break; | ||
3374 | |||
3375 | case FIT_MTD_CMD_LOG_TIME_STAMP_LO: | ||
3376 | skdev->drive_jiffies = FIT_MXD_DATA(mfd); | ||
3377 | data = (skdev->connect_time_stamp >> 16) & 0xFFFF; | ||
3378 | mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_HI, 0, data); | ||
3379 | SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); | ||
3380 | skdev->last_mtd = mtd; | ||
3381 | break; | ||
3382 | |||
3383 | case FIT_MTD_CMD_LOG_TIME_STAMP_HI: | ||
3384 | skdev->drive_jiffies |= (FIT_MXD_DATA(mfd) << 16); | ||
3385 | mtd = FIT_MXD_CONS(FIT_MTD_ARM_QUEUE, 0, 0); | ||
3386 | SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); | ||
3387 | skdev->last_mtd = mtd; | ||
3388 | |||
3389 | pr_err("(%s): Time sync driver=0x%x device=0x%x\n", | ||
3390 | skd_name(skdev), | ||
3391 | skdev->connect_time_stamp, skdev->drive_jiffies); | ||
3392 | break; | ||
3393 | |||
3394 | case FIT_MTD_ARM_QUEUE: | ||
3395 | skdev->last_mtd = 0; | ||
3396 | /* | ||
3397 | * State should be, or soon will be, FIT_SR_DRIVE_ONLINE. | ||
3398 | */ | ||
3399 | break; | ||
3400 | |||
3401 | default: | ||
3402 | break; | ||
3403 | } | ||
3404 | } | ||
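Read end to end, the switch above is a one-message-outstanding handshake: each acknowledged FIT_MTD_* reply triggers the next request, and last_mtd records what is currently outstanding so unrelated acks are ignored. A comment-style summary of the sequence, derived only from the cases above:

    /*
     *   FIT_MTD_FITFW_INIT             sent from skd_isr_fwstate() on DRIVE_INIT;
     *                                  its ack carries the protocol version
     *   -> FIT_MTD_GET_CMDQ_DEPTH      ack gives dev_max_queue_depth
     *   -> FIT_MTD_SET_COMPQ_DEPTH     advertises SKD_N_COMPLETION_ENTRY
     *   -> FIT_MTD_SET_COMPQ_ADDR      cq_dma_address via FIT_MSG_TO_DEVICE_ARG
     *   -> FIT_MTD_CMD_LOG_HOST_ID     logs devno
     *   -> FIT_MTD_CMD_LOG_TIME_STAMP_LO / _HI   time-stamp exchange
     *   -> FIT_MTD_ARM_QUEUE           last_mtd cleared; drive heads for ONLINE
     */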
3405 | |||
3406 | static void skd_disable_interrupts(struct skd_device *skdev) | ||
3407 | { | ||
3408 | u32 sense; | ||
3409 | |||
3410 | sense = SKD_READL(skdev, FIT_CONTROL); | ||
3411 | sense &= ~FIT_CR_ENABLE_INTERRUPTS; | ||
3412 | SKD_WRITEL(skdev, sense, FIT_CONTROL); | ||
3413 | pr_debug("%s:%s:%d sense 0x%x\n", | ||
3414 | skdev->name, __func__, __LINE__, sense); | ||
3415 | |||
3416 | /* Note that all 1s are written. A 1-bit means | ||
3417 | * disable, a 0 means enable. | ||
3418 | */ | ||
3419 | SKD_WRITEL(skdev, ~0, FIT_INT_MASK_HOST); | ||
3420 | } | ||
3421 | |||
3422 | static void skd_enable_interrupts(struct skd_device *skdev) | ||
3423 | { | ||
3424 | u32 val; | ||
3425 | |||
3426 | /* unmask interrupts first */ | ||
3427 | val = FIT_ISH_FW_STATE_CHANGE + | ||
3428 | FIT_ISH_COMPLETION_POSTED + FIT_ISH_MSG_FROM_DEV; | ||
3429 | |||
3430 | /* Note that the complement of the mask is written. A 1-bit means | ||
3431 | * disable, a 0 means enable. */ | ||
3432 | SKD_WRITEL(skdev, ~val, FIT_INT_MASK_HOST); | ||
3433 | pr_debug("%s:%s:%d interrupt mask=0x%x\n", | ||
3434 | skdev->name, __func__, __LINE__, ~val); | ||
3435 | |||
3436 | val = SKD_READL(skdev, FIT_CONTROL); | ||
3437 | val |= FIT_CR_ENABLE_INTERRUPTS; | ||
3438 | pr_debug("%s:%s:%d control=0x%x\n", | ||
3439 | skdev->name, __func__, __LINE__, val); | ||
3440 | SKD_WRITEL(skdev, val, FIT_CONTROL); | ||
3441 | } | ||
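Both routines rely on the inverted convention of FIT_INT_MASK_HOST noted in the comments (a set bit masks an interrupt, a clear bit unmasks it). As a hedged illustration, a hypothetical caller that wanted only completion interrupts would write the complement of that single bit:

    /* Illustration only; no such call exists in the driver. */
    SKD_WRITEL(skdev, ~FIT_ISH_COMPLETION_POSTED, FIT_INT_MASK_HOST);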
3442 | |||
3443 | /* | ||
3444 | ***************************************************************************** | ||
3445 | * START, STOP, RESTART, QUIESCE, UNQUIESCE | ||
3446 | ***************************************************************************** | ||
3447 | */ | ||
3448 | |||
3449 | static void skd_soft_reset(struct skd_device *skdev) | ||
3450 | { | ||
3451 | u32 val; | ||
3452 | |||
3453 | val = SKD_READL(skdev, FIT_CONTROL); | ||
3454 | val |= (FIT_CR_SOFT_RESET); | ||
3455 | pr_debug("%s:%s:%d control=0x%x\n", | ||
3456 | skdev->name, __func__, __LINE__, val); | ||
3457 | SKD_WRITEL(skdev, val, FIT_CONTROL); | ||
3458 | } | ||
3459 | |||
3460 | static void skd_start_device(struct skd_device *skdev) | ||
3461 | { | ||
3462 | unsigned long flags; | ||
3463 | u32 sense; | ||
3464 | u32 state; | ||
3465 | |||
3466 | spin_lock_irqsave(&skdev->lock, flags); | ||
3467 | |||
3468 | /* ack all ghost interrupts */ | ||
3469 | SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST); | ||
3470 | |||
3471 | sense = SKD_READL(skdev, FIT_STATUS); | ||
3472 | |||
3473 | pr_debug("%s:%s:%d initial status=0x%x\n", | ||
3474 | skdev->name, __func__, __LINE__, sense); | ||
3475 | |||
3476 | state = sense & FIT_SR_DRIVE_STATE_MASK; | ||
3477 | skdev->drive_state = state; | ||
3478 | skdev->last_mtd = 0; | ||
3479 | |||
3480 | skdev->state = SKD_DRVR_STATE_STARTING; | ||
3481 | skdev->timer_countdown = SKD_STARTING_TIMO; | ||
3482 | |||
3483 | skd_enable_interrupts(skdev); | ||
3484 | |||
3485 | switch (skdev->drive_state) { | ||
3486 | case FIT_SR_DRIVE_OFFLINE: | ||
3487 | pr_err("(%s): Drive offline...\n", skd_name(skdev)); | ||
3488 | break; | ||
3489 | |||
3490 | case FIT_SR_DRIVE_FW_BOOTING: | ||
3491 | pr_debug("%s:%s:%d FIT_SR_DRIVE_FW_BOOTING %s\n", | ||
3492 | skdev->name, __func__, __LINE__, skdev->name); | ||
3493 | skdev->state = SKD_DRVR_STATE_WAIT_BOOT; | ||
3494 | skdev->timer_countdown = SKD_WAIT_BOOT_TIMO; | ||
3495 | break; | ||
3496 | |||
3497 | case FIT_SR_DRIVE_BUSY_SANITIZE: | ||
3498 | pr_info("(%s): Start: BUSY_SANITIZE\n", | ||
3499 | skd_name(skdev)); | ||
3500 | skdev->state = SKD_DRVR_STATE_BUSY_SANITIZE; | ||
3501 | skdev->timer_countdown = SKD_STARTED_BUSY_TIMO; | ||
3502 | break; | ||
3503 | |||
3504 | case FIT_SR_DRIVE_BUSY_ERASE: | ||
3505 | pr_info("(%s): Start: BUSY_ERASE\n", skd_name(skdev)); | ||
3506 | skdev->state = SKD_DRVR_STATE_BUSY_ERASE; | ||
3507 | skdev->timer_countdown = SKD_STARTED_BUSY_TIMO; | ||
3508 | break; | ||
3509 | |||
3510 | case FIT_SR_DRIVE_INIT: | ||
3511 | case FIT_SR_DRIVE_ONLINE: | ||
3512 | skd_soft_reset(skdev); | ||
3513 | break; | ||
3514 | |||
3515 | case FIT_SR_DRIVE_BUSY: | ||
3516 | pr_err("(%s): Drive Busy...\n", skd_name(skdev)); | ||
3517 | skdev->state = SKD_DRVR_STATE_BUSY; | ||
3518 | skdev->timer_countdown = SKD_STARTED_BUSY_TIMO; | ||
3519 | break; | ||
3520 | |||
3521 | case FIT_SR_DRIVE_SOFT_RESET: | ||
3522 | pr_err("(%s) drive soft reset in prog\n", | ||
3523 | skd_name(skdev)); | ||
3524 | break; | ||
3525 | |||
3526 | case FIT_SR_DRIVE_FAULT: | ||
3527 | /* Fault state is bad...soft reset won't do it... | ||
3528 | * Hard reset, maybe, but does it work on device? | ||
3529 | * For now, just fault so the system doesn't hang. | ||
3530 | */ | ||
3531 | skd_drive_fault(skdev); | ||
3532 | /*start the queue so we can respond with error to requests */ | ||
3533 | pr_debug("%s:%s:%d starting %s queue\n", | ||
3534 | skdev->name, __func__, __LINE__, skdev->name); | ||
3535 | blk_start_queue(skdev->queue); | ||
3536 | skdev->gendisk_on = -1; | ||
3537 | wake_up_interruptible(&skdev->waitq); | ||
3538 | break; | ||
3539 | |||
3540 | case 0xFF: | ||
3541 | /* Most likely the device isn't there or isn't responding | ||
3542 | * to the BAR1 addresses. */ | ||
3543 | skd_drive_disappeared(skdev); | ||
3544 | /*start the queue so we can respond with error to requests */ | ||
3545 | pr_debug("%s:%s:%d starting %s queue to error-out reqs\n", | ||
3546 | skdev->name, __func__, __LINE__, skdev->name); | ||
3547 | blk_start_queue(skdev->queue); | ||
3548 | skdev->gendisk_on = -1; | ||
3549 | wake_up_interruptible(&skdev->waitq); | ||
3550 | break; | ||
3551 | |||
3552 | default: | ||
3553 | pr_err("(%s) Start: unknown state %x\n", | ||
3554 | skd_name(skdev), skdev->drive_state); | ||
3555 | break; | ||
3556 | } | ||
3557 | |||
3558 | state = SKD_READL(skdev, FIT_CONTROL); | ||
3559 | pr_debug("%s:%s:%d FIT Control Status=0x%x\n", | ||
3560 | skdev->name, __func__, __LINE__, state); | ||
3561 | |||
3562 | state = SKD_READL(skdev, FIT_INT_STATUS_HOST); | ||
3563 | pr_debug("%s:%s:%d Intr Status=0x%x\n", | ||
3564 | skdev->name, __func__, __LINE__, state); | ||
3565 | |||
3566 | state = SKD_READL(skdev, FIT_INT_MASK_HOST); | ||
3567 | pr_debug("%s:%s:%d Intr Mask=0x%x\n", | ||
3568 | skdev->name, __func__, __LINE__, state); | ||
3569 | |||
3570 | state = SKD_READL(skdev, FIT_MSG_FROM_DEVICE); | ||
3571 | pr_debug("%s:%s:%d Msg from Dev=0x%x\n", | ||
3572 | skdev->name, __func__, __LINE__, state); | ||
3573 | |||
3574 | state = SKD_READL(skdev, FIT_HW_VERSION); | ||
3575 | pr_debug("%s:%s:%d HW version=0x%x\n", | ||
3576 | skdev->name, __func__, __LINE__, state); | ||
3577 | |||
3578 | spin_unlock_irqrestore(&skdev->lock, flags); | ||
3579 | } | ||
3580 | |||
3581 | static void skd_stop_device(struct skd_device *skdev) | ||
3582 | { | ||
3583 | unsigned long flags; | ||
3584 | struct skd_special_context *skspcl = &skdev->internal_skspcl; | ||
3585 | u32 dev_state; | ||
3586 | int i; | ||
3587 | |||
3588 | spin_lock_irqsave(&skdev->lock, flags); | ||
3589 | |||
3590 | if (skdev->state != SKD_DRVR_STATE_ONLINE) { | ||
3591 | pr_err("(%s): skd_stop_device not online no sync\n", | ||
3592 | skd_name(skdev)); | ||
3593 | goto stop_out; | ||
3594 | } | ||
3595 | |||
3596 | if (skspcl->req.state != SKD_REQ_STATE_IDLE) { | ||
3597 | pr_err("(%s): skd_stop_device no special\n", | ||
3598 | skd_name(skdev)); | ||
3599 | goto stop_out; | ||
3600 | } | ||
3601 | |||
3602 | skdev->state = SKD_DRVR_STATE_SYNCING; | ||
3603 | skdev->sync_done = 0; | ||
3604 | |||
3605 | skd_send_internal_skspcl(skdev, skspcl, SYNCHRONIZE_CACHE); | ||
3606 | |||
3607 | spin_unlock_irqrestore(&skdev->lock, flags); | ||
3608 | |||
3609 | wait_event_interruptible_timeout(skdev->waitq, | ||
3610 | (skdev->sync_done), (10 * HZ)); | ||
3611 | |||
3612 | spin_lock_irqsave(&skdev->lock, flags); | ||
3613 | |||
3614 | switch (skdev->sync_done) { | ||
3615 | case 0: | ||
3616 | pr_err("(%s): skd_stop_device no sync\n", | ||
3617 | skd_name(skdev)); | ||
3618 | break; | ||
3619 | case 1: | ||
3620 | pr_err("(%s): skd_stop_device sync done\n", | ||
3621 | skd_name(skdev)); | ||
3622 | break; | ||
3623 | default: | ||
3624 | pr_err("(%s): skd_stop_device sync error\n", | ||
3625 | skd_name(skdev)); | ||
3626 | } | ||
3627 | |||
3628 | stop_out: | ||
3629 | skdev->state = SKD_DRVR_STATE_STOPPING; | ||
3630 | spin_unlock_irqrestore(&skdev->lock, flags); | ||
3631 | |||
3632 | skd_kill_timer(skdev); | ||
3633 | |||
3634 | spin_lock_irqsave(&skdev->lock, flags); | ||
3635 | skd_disable_interrupts(skdev); | ||
3636 | |||
3637 | /* ensure all ints on device are cleared */ | ||
3638 | /* soft reset the device to unload with a clean slate */ | ||
3639 | SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST); | ||
3640 | SKD_WRITEL(skdev, FIT_CR_SOFT_RESET, FIT_CONTROL); | ||
3641 | |||
3642 | spin_unlock_irqrestore(&skdev->lock, flags); | ||
3643 | |||
3644 | /* poll every 100ms, 1 second timeout */ | ||
3645 | for (i = 0; i < 10; i++) { | ||
3646 | dev_state = | ||
3647 | SKD_READL(skdev, FIT_STATUS) & FIT_SR_DRIVE_STATE_MASK; | ||
3648 | if (dev_state == FIT_SR_DRIVE_INIT) | ||
3649 | break; | ||
3650 | set_current_state(TASK_INTERRUPTIBLE); | ||
3651 | schedule_timeout(msecs_to_jiffies(100)); | ||
3652 | } | ||
3653 | |||
3654 | if (dev_state != FIT_SR_DRIVE_INIT) | ||
3655 | pr_err("(%s): skd_stop_device state error 0x%02x\n", | ||
3656 | skd_name(skdev), dev_state); | ||
3657 | } | ||
3658 | |||
3659 | /* assume spinlock is held */ | ||
3660 | static void skd_restart_device(struct skd_device *skdev) | ||
3661 | { | ||
3662 | u32 state; | ||
3663 | |||
3664 | /* ack all ghost interrupts */ | ||
3665 | SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST); | ||
3666 | |||
3667 | state = SKD_READL(skdev, FIT_STATUS); | ||
3668 | |||
3669 | pr_debug("%s:%s:%d drive status=0x%x\n", | ||
3670 | skdev->name, __func__, __LINE__, state); | ||
3671 | |||
3672 | state &= FIT_SR_DRIVE_STATE_MASK; | ||
3673 | skdev->drive_state = state; | ||
3674 | skdev->last_mtd = 0; | ||
3675 | |||
3676 | skdev->state = SKD_DRVR_STATE_RESTARTING; | ||
3677 | skdev->timer_countdown = SKD_RESTARTING_TIMO; | ||
3678 | |||
3679 | skd_soft_reset(skdev); | ||
3680 | } | ||
3681 | |||
3682 | /* assume spinlock is held */ | ||
3683 | static int skd_quiesce_dev(struct skd_device *skdev) | ||
3684 | { | ||
3685 | int rc = 0; | ||
3686 | |||
3687 | switch (skdev->state) { | ||
3688 | case SKD_DRVR_STATE_BUSY: | ||
3689 | case SKD_DRVR_STATE_BUSY_IMMINENT: | ||
3690 | pr_debug("%s:%s:%d stopping %s queue\n", | ||
3691 | skdev->name, __func__, __LINE__, skdev->name); | ||
3692 | blk_stop_queue(skdev->queue); | ||
3693 | break; | ||
3694 | case SKD_DRVR_STATE_ONLINE: | ||
3695 | case SKD_DRVR_STATE_STOPPING: | ||
3696 | case SKD_DRVR_STATE_SYNCING: | ||
3697 | case SKD_DRVR_STATE_PAUSING: | ||
3698 | case SKD_DRVR_STATE_PAUSED: | ||
3699 | case SKD_DRVR_STATE_STARTING: | ||
3700 | case SKD_DRVR_STATE_RESTARTING: | ||
3701 | case SKD_DRVR_STATE_RESUMING: | ||
3702 | default: | ||
3703 | rc = -EINVAL; | ||
3704 | pr_debug("%s:%s:%d state [%d] not implemented\n", | ||
3705 | skdev->name, __func__, __LINE__, skdev->state); | ||
3706 | } | ||
3707 | return rc; | ||
3708 | } | ||
3709 | |||
3710 | /* assume spinlock is held */ | ||
3711 | static int skd_unquiesce_dev(struct skd_device *skdev) | ||
3712 | { | ||
3713 | int prev_driver_state = skdev->state; | ||
3714 | |||
3715 | skd_log_skdev(skdev, "unquiesce"); | ||
3716 | if (skdev->state == SKD_DRVR_STATE_ONLINE) { | ||
3717 | pr_debug("%s:%s:%d **** device already ONLINE\n", | ||
3718 | skdev->name, __func__, __LINE__); | ||
3719 | return 0; | ||
3720 | } | ||
3721 | if (skdev->drive_state != FIT_SR_DRIVE_ONLINE) { | ||
3722 | /* | ||
3723 | * If there has been a state change to something other than | ||
3724 | * ONLINE, we will rely on a controller state change | ||
3725 | * to come back online and restart the queue. | ||
3726 | * The BUSY state means the driver is ready to | ||
3727 | * continue normal processing but is waiting for the | ||
3728 | * controller to become available. | ||
3729 | */ | ||
3730 | skdev->state = SKD_DRVR_STATE_BUSY; | ||
3731 | pr_debug("%s:%s:%d drive BUSY state\n", | ||
3732 | skdev->name, __func__, __LINE__); | ||
3733 | return 0; | ||
3734 | } | ||
3735 | |||
3736 | /* | ||
3737 | * Drive has just come online; the driver is either in startup, | ||
3738 | * paused performing a task, or busy waiting for hardware. | ||
3739 | */ | ||
3740 | switch (skdev->state) { | ||
3741 | case SKD_DRVR_STATE_PAUSED: | ||
3742 | case SKD_DRVR_STATE_BUSY: | ||
3743 | case SKD_DRVR_STATE_BUSY_IMMINENT: | ||
3744 | case SKD_DRVR_STATE_BUSY_ERASE: | ||
3745 | case SKD_DRVR_STATE_STARTING: | ||
3746 | case SKD_DRVR_STATE_RESTARTING: | ||
3747 | case SKD_DRVR_STATE_FAULT: | ||
3748 | case SKD_DRVR_STATE_IDLE: | ||
3749 | case SKD_DRVR_STATE_LOAD: | ||
3750 | skdev->state = SKD_DRVR_STATE_ONLINE; | ||
3751 | pr_err("(%s): Driver state %s(%d)=>%s(%d)\n", | ||
3752 | skd_name(skdev), | ||
3753 | skd_skdev_state_to_str(prev_driver_state), | ||
3754 | prev_driver_state, skd_skdev_state_to_str(skdev->state), | ||
3755 | skdev->state); | ||
3756 | pr_debug("%s:%s:%d **** device ONLINE...starting block queue\n", | ||
3757 | skdev->name, __func__, __LINE__); | ||
3758 | pr_debug("%s:%s:%d starting %s queue\n", | ||
3759 | skdev->name, __func__, __LINE__, skdev->name); | ||
3760 | pr_info("(%s): STEC s1120 ONLINE\n", skd_name(skdev)); | ||
3761 | blk_start_queue(skdev->queue); | ||
3762 | skdev->gendisk_on = 1; | ||
3763 | wake_up_interruptible(&skdev->waitq); | ||
3764 | break; | ||
3765 | |||
3766 | case SKD_DRVR_STATE_DISAPPEARED: | ||
3767 | default: | ||
3768 | pr_debug("%s:%s:%d **** driver state %d, not implemented \n", | ||
3769 | skdev->name, __func__, __LINE__, | ||
3770 | skdev->state); | ||
3771 | return -EBUSY; | ||
3772 | } | ||
3773 | return 0; | ||
3774 | } | ||
3775 | |||
3776 | /* | ||
3777 | ***************************************************************************** | ||
3778 | * PCIe MSI/MSI-X INTERRUPT HANDLERS | ||
3779 | ***************************************************************************** | ||
3780 | */ | ||
3781 | |||
3782 | static irqreturn_t skd_reserved_isr(int irq, void *skd_host_data) | ||
3783 | { | ||
3784 | struct skd_device *skdev = skd_host_data; | ||
3785 | unsigned long flags; | ||
3786 | |||
3787 | spin_lock_irqsave(&skdev->lock, flags); | ||
3788 | pr_debug("%s:%s:%d MSIX = 0x%x\n", | ||
3789 | skdev->name, __func__, __LINE__, | ||
3790 | SKD_READL(skdev, FIT_INT_STATUS_HOST)); | ||
3791 | pr_err("(%s): MSIX reserved irq %d = 0x%x\n", skd_name(skdev), | ||
3792 | irq, SKD_READL(skdev, FIT_INT_STATUS_HOST)); | ||
3793 | SKD_WRITEL(skdev, FIT_INT_RESERVED_MASK, FIT_INT_STATUS_HOST); | ||
3794 | spin_unlock_irqrestore(&skdev->lock, flags); | ||
3795 | return IRQ_HANDLED; | ||
3796 | } | ||
3797 | |||
3798 | static irqreturn_t skd_statec_isr(int irq, void *skd_host_data) | ||
3799 | { | ||
3800 | struct skd_device *skdev = skd_host_data; | ||
3801 | unsigned long flags; | ||
3802 | |||
3803 | spin_lock_irqsave(&skdev->lock, flags); | ||
3804 | pr_debug("%s:%s:%d MSIX = 0x%x\n", | ||
3805 | skdev->name, __func__, __LINE__, | ||
3806 | SKD_READL(skdev, FIT_INT_STATUS_HOST)); | ||
3807 | SKD_WRITEL(skdev, FIT_ISH_FW_STATE_CHANGE, FIT_INT_STATUS_HOST); | ||
3808 | skd_isr_fwstate(skdev); | ||
3809 | spin_unlock_irqrestore(&skdev->lock, flags); | ||
3810 | return IRQ_HANDLED; | ||
3811 | } | ||
3812 | |||
3813 | static irqreturn_t skd_comp_q(int irq, void *skd_host_data) | ||
3814 | { | ||
3815 | struct skd_device *skdev = skd_host_data; | ||
3816 | unsigned long flags; | ||
3817 | int flush_enqueued = 0; | ||
3818 | int deferred; | ||
3819 | |||
3820 | spin_lock_irqsave(&skdev->lock, flags); | ||
3821 | pr_debug("%s:%s:%d MSIX = 0x%x\n", | ||
3822 | skdev->name, __func__, __LINE__, | ||
3823 | SKD_READL(skdev, FIT_INT_STATUS_HOST)); | ||
3824 | SKD_WRITEL(skdev, FIT_ISH_COMPLETION_POSTED, FIT_INT_STATUS_HOST); | ||
3825 | deferred = skd_isr_completion_posted(skdev, skd_isr_comp_limit, | ||
3826 | &flush_enqueued); | ||
3827 | if (flush_enqueued) | ||
3828 | skd_request_fn(skdev->queue); | ||
3829 | |||
3830 | if (deferred) | ||
3831 | schedule_work(&skdev->completion_worker); | ||
3832 | else if (!flush_enqueued) | ||
3833 | skd_request_fn(skdev->queue); | ||
3834 | |||
3835 | spin_unlock_irqrestore(&skdev->lock, flags); | ||
3836 | |||
3837 | return IRQ_HANDLED; | ||
3838 | } | ||
3839 | |||
3840 | static irqreturn_t skd_msg_isr(int irq, void *skd_host_data) | ||
3841 | { | ||
3842 | struct skd_device *skdev = skd_host_data; | ||
3843 | unsigned long flags; | ||
3844 | |||
3845 | spin_lock_irqsave(&skdev->lock, flags); | ||
3846 | pr_debug("%s:%s:%d MSIX = 0x%x\n", | ||
3847 | skdev->name, __func__, __LINE__, | ||
3848 | SKD_READL(skdev, FIT_INT_STATUS_HOST)); | ||
3849 | SKD_WRITEL(skdev, FIT_ISH_MSG_FROM_DEV, FIT_INT_STATUS_HOST); | ||
3850 | skd_isr_msg_from_dev(skdev); | ||
3851 | spin_unlock_irqrestore(&skdev->lock, flags); | ||
3852 | return IRQ_HANDLED; | ||
3853 | } | ||
3854 | |||
3855 | static irqreturn_t skd_qfull_isr(int irq, void *skd_host_data) | ||
3856 | { | ||
3857 | struct skd_device *skdev = skd_host_data; | ||
3858 | unsigned long flags; | ||
3859 | |||
3860 | spin_lock_irqsave(&skdev->lock, flags); | ||
3861 | pr_debug("%s:%s:%d MSIX = 0x%x\n", | ||
3862 | skdev->name, __func__, __LINE__, | ||
3863 | SKD_READL(skdev, FIT_INT_STATUS_HOST)); | ||
3864 | SKD_WRITEL(skdev, FIT_INT_QUEUE_FULL, FIT_INT_STATUS_HOST); | ||
3865 | spin_unlock_irqrestore(&skdev->lock, flags); | ||
3866 | return IRQ_HANDLED; | ||
3867 | } | ||
3868 | |||
3869 | /* | ||
3870 | ***************************************************************************** | ||
3871 | * PCIe MSI/MSI-X SETUP | ||
3872 | ***************************************************************************** | ||
3873 | */ | ||
3874 | |||
3875 | struct skd_msix_entry { | ||
3876 | int have_irq; | ||
3877 | u32 vector; | ||
3878 | u32 entry; | ||
3879 | struct skd_device *rsp; | ||
3880 | char isr_name[30]; | ||
3881 | }; | ||
3882 | |||
3883 | struct skd_init_msix_entry { | ||
3884 | const char *name; | ||
3885 | irq_handler_t handler; | ||
3886 | }; | ||
3887 | |||
3888 | #define SKD_MAX_MSIX_COUNT 13 | ||
3889 | #define SKD_MIN_MSIX_COUNT 7 | ||
3890 | #define SKD_BASE_MSIX_IRQ 4 | ||
3891 | |||
3892 | static struct skd_init_msix_entry msix_entries[SKD_MAX_MSIX_COUNT] = { | ||
3893 | { "(DMA 0)", skd_reserved_isr }, | ||
3894 | { "(DMA 1)", skd_reserved_isr }, | ||
3895 | { "(DMA 2)", skd_reserved_isr }, | ||
3896 | { "(DMA 3)", skd_reserved_isr }, | ||
3897 | { "(State Change)", skd_statec_isr }, | ||
3898 | { "(COMPL_Q)", skd_comp_q }, | ||
3899 | { "(MSG)", skd_msg_isr }, | ||
3900 | { "(Reserved)", skd_reserved_isr }, | ||
3901 | { "(Reserved)", skd_reserved_isr }, | ||
3902 | { "(Queue Full 0)", skd_qfull_isr }, | ||
3903 | { "(Queue Full 1)", skd_qfull_isr }, | ||
3904 | { "(Queue Full 2)", skd_qfull_isr }, | ||
3905 | { "(Queue Full 3)", skd_qfull_isr }, | ||
3906 | }; | ||
3907 | |||
3908 | static void skd_release_msix(struct skd_device *skdev) | ||
3909 | { | ||
3910 | struct skd_msix_entry *qentry; | ||
3911 | int i; | ||
3912 | |||
3913 | if (skdev->msix_entries == NULL) | ||
3914 | return; | ||
3915 | for (i = 0; i < skdev->msix_count; i++) { | ||
3916 | qentry = &skdev->msix_entries[i]; | ||
3917 | skdev = qentry->rsp; | ||
3918 | |||
3919 | if (qentry->have_irq) | ||
3920 | devm_free_irq(&skdev->pdev->dev, | ||
3921 | qentry->vector, qentry->rsp); | ||
3922 | } | ||
3923 | pci_disable_msix(skdev->pdev); | ||
3924 | kfree(skdev->msix_entries); | ||
3925 | skdev->msix_count = 0; | ||
3926 | skdev->msix_entries = NULL; | ||
3927 | } | ||
3928 | |||
3929 | static int skd_acquire_msix(struct skd_device *skdev) | ||
3930 | { | ||
3931 | int i, rc; | ||
3932 | struct pci_dev *pdev; | ||
3933 | struct msix_entry *entries = NULL; | ||
3934 | struct skd_msix_entry *qentry; | ||
3935 | |||
3936 | pdev = skdev->pdev; | ||
3937 | skdev->msix_count = SKD_MAX_MSIX_COUNT; | ||
3938 | entries = kzalloc(sizeof(struct msix_entry) * SKD_MAX_MSIX_COUNT, | ||
3939 | GFP_KERNEL); | ||
3940 | if (!entries) | ||
3941 | return -ENOMEM; | ||
3942 | |||
3943 | for (i = 0; i < SKD_MAX_MSIX_COUNT; i++) | ||
3944 | entries[i].entry = i; | ||
3945 | |||
3946 | rc = pci_enable_msix(pdev, entries, SKD_MAX_MSIX_COUNT); | ||
3947 | if (rc < 0) | ||
3948 | goto msix_out; | ||
3949 | if (rc) { | ||
3950 | if (rc < SKD_MIN_MSIX_COUNT) { | ||
3951 | pr_err("(%s): failed to enable MSI-X %d\n", | ||
3952 | skd_name(skdev), rc); | ||
3953 | goto msix_out; | ||
3954 | } | ||
3955 | pr_debug("%s:%s:%d %s: <%s> allocated %d MSI-X vectors\n", | ||
3956 | skdev->name, __func__, __LINE__, | ||
3957 | pci_name(pdev), skdev->name, rc); | ||
3958 | |||
3959 | skdev->msix_count = rc; | ||
3960 | rc = pci_enable_msix(pdev, entries, skdev->msix_count); | ||
3961 | if (rc) { | ||
3962 | pr_err("(%s): failed to enable MSI-X " | ||
3963 | "support (%d) %d\n", | ||
3964 | skd_name(skdev), skdev->msix_count, rc); | ||
3965 | goto msix_out; | ||
3966 | } | ||
3967 | } | ||
3968 | skdev->msix_entries = kzalloc(sizeof(struct skd_msix_entry) * | ||
3969 | skdev->msix_count, GFP_KERNEL); | ||
3970 | if (!skdev->msix_entries) { | ||
3971 | rc = -ENOMEM; | ||
3972 | skdev->msix_count = 0; | ||
3973 | pr_err("(%s): msix table allocation error\n", | ||
3974 | skd_name(skdev)); | ||
3975 | goto msix_out; | ||
3976 | } | ||
3977 | |||
3978 | qentry = skdev->msix_entries; | ||
3979 | for (i = 0; i < skdev->msix_count; i++) { | ||
3980 | qentry->vector = entries[i].vector; | ||
3981 | qentry->entry = entries[i].entry; | ||
3982 | qentry->rsp = NULL; | ||
3983 | qentry->have_irq = 0; | ||
3984 | pr_debug("%s:%s:%d %s: <%s> msix (%d) vec %d, entry %x\n", | ||
3985 | skdev->name, __func__, __LINE__, | ||
3986 | pci_name(pdev), skdev->name, | ||
3987 | i, qentry->vector, qentry->entry); | ||
3988 | qentry++; | ||
3989 | } | ||
3990 | |||
3991 | /* Enable MSI-X vectors for the base queue */ | ||
3992 | for (i = 0; i < SKD_MAX_MSIX_COUNT; i++) { | ||
3993 | qentry = &skdev->msix_entries[i]; | ||
3994 | snprintf(qentry->isr_name, sizeof(qentry->isr_name), | ||
3995 | "%s%d-msix %s", DRV_NAME, skdev->devno, | ||
3996 | msix_entries[i].name); | ||
3997 | rc = devm_request_irq(&skdev->pdev->dev, qentry->vector, | ||
3998 | msix_entries[i].handler, 0, | ||
3999 | qentry->isr_name, skdev); | ||
4000 | if (rc) { | ||
4001 | pr_err("(%s): Unable to register(%d) MSI-X " | ||
4002 | "handler %d: %s\n", | ||
4003 | skd_name(skdev), rc, i, qentry->isr_name); | ||
4004 | goto msix_out; | ||
4005 | } else { | ||
4006 | qentry->have_irq = 1; | ||
4007 | qentry->rsp = skdev; | ||
4008 | } | ||
4009 | } | ||
4010 | pr_debug("%s:%s:%d %s: <%s> msix %d irq(s) enabled\n", | ||
4011 | skdev->name, __func__, __LINE__, | ||
4012 | pci_name(pdev), skdev->name, skdev->msix_count); | ||
4013 | return 0; | ||
4014 | |||
4015 | msix_out: | ||
4016 | if (entries) | ||
4017 | kfree(entries); | ||
4018 | skd_release_msix(skdev); | ||
4019 | return rc; | ||
4020 | } | ||
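The two pci_enable_msix() calls above lean on the historical return convention of that interface (this predates pci_enable_msix_range()); spelled out, since the retry is easy to misread:

    /*
     * pci_enable_msix(pdev, entries, nvec) as relied on above:
     *   < 0   hard error, MSI-X cannot be used
     *   > 0   request too large; the value is how many vectors are
     *         actually available, so the driver retries with that count
     *   == 0  all requested vectors were allocated
     */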
4021 | |||
4022 | static int skd_acquire_irq(struct skd_device *skdev) | ||
4023 | { | ||
4024 | int rc; | ||
4025 | struct pci_dev *pdev; | ||
4026 | |||
4027 | pdev = skdev->pdev; | ||
4028 | skdev->msix_count = 0; | ||
4029 | |||
4030 | RETRY_IRQ_TYPE: | ||
4031 | switch (skdev->irq_type) { | ||
4032 | case SKD_IRQ_MSIX: | ||
4033 | rc = skd_acquire_msix(skdev); | ||
4034 | if (!rc) | ||
4035 | pr_info("(%s): MSI-X %d irqs enabled\n", | ||
4036 | skd_name(skdev), skdev->msix_count); | ||
4037 | else { | ||
4038 | pr_err( | ||
4039 | "(%s): failed to enable MSI-X, re-trying with MSI %d\n", | ||
4040 | skd_name(skdev), rc); | ||
4041 | skdev->irq_type = SKD_IRQ_MSI; | ||
4042 | goto RETRY_IRQ_TYPE; | ||
4043 | } | ||
4044 | break; | ||
4045 | case SKD_IRQ_MSI: | ||
4046 | snprintf(skdev->isr_name, sizeof(skdev->isr_name), "%s%d-msi", | ||
4047 | DRV_NAME, skdev->devno); | ||
4048 | rc = pci_enable_msi(pdev); | ||
4049 | if (!rc) { | ||
4050 | rc = devm_request_irq(&pdev->dev, pdev->irq, skd_isr, 0, | ||
4051 | skdev->isr_name, skdev); | ||
4052 | if (rc) { | ||
4053 | pci_disable_msi(pdev); | ||
4054 | pr_err( | ||
4055 | "(%s): failed to allocate the MSI interrupt %d\n", | ||
4056 | skd_name(skdev), rc); | ||
4057 | goto RETRY_IRQ_LEGACY; | ||
4058 | } | ||
4059 | pr_info("(%s): MSI irq %d enabled\n", | ||
4060 | skd_name(skdev), pdev->irq); | ||
4061 | } else { | ||
4062 | RETRY_IRQ_LEGACY: | ||
4063 | pr_err( | ||
4064 | "(%s): failed to enable MSI, re-trying with LEGACY %d\n", | ||
4065 | skd_name(skdev), rc); | ||
4066 | skdev->irq_type = SKD_IRQ_LEGACY; | ||
4067 | goto RETRY_IRQ_TYPE; | ||
4068 | } | ||
4069 | break; | ||
4070 | case SKD_IRQ_LEGACY: | ||
4071 | snprintf(skdev->isr_name, sizeof(skdev->isr_name), | ||
4072 | "%s%d-legacy", DRV_NAME, skdev->devno); | ||
4073 | rc = devm_request_irq(&pdev->dev, pdev->irq, skd_isr, | ||
4074 | IRQF_SHARED, skdev->isr_name, skdev); | ||
4075 | if (!rc) | ||
4076 | pr_info("(%s): LEGACY irq %d enabled\n", | ||
4077 | skd_name(skdev), pdev->irq); | ||
4078 | else | ||
4079 | pr_err("(%s): request LEGACY irq error %d\n", | ||
4080 | skd_name(skdev), rc); | ||
4081 | break; | ||
4082 | default: | ||
4083 | pr_info("(%s): irq_type %d invalid, re-set to %d\n", | ||
4084 | skd_name(skdev), skdev->irq_type, SKD_IRQ_DEFAULT); | ||
4085 | skdev->irq_type = SKD_IRQ_LEGACY; | ||
4086 | goto RETRY_IRQ_TYPE; | ||
4087 | } | ||
4088 | return rc; | ||
4089 | } | ||
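The RETRY_IRQ_TYPE goto above implements a fixed degradation order between interrupt modes; restated as a comment (nothing beyond what the switch already encodes):

    /*
     *   SKD_IRQ_MSIX    --(skd_acquire_msix() fails)-->  SKD_IRQ_MSI
     *   SKD_IRQ_MSI     --(enable/request fails)------>  SKD_IRQ_LEGACY
     *   SKD_IRQ_LEGACY  --(request_irq fails)--------->  rc returned to caller
     *   anything else   --> forced to SKD_IRQ_LEGACY and retried
     */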
4090 | |||
4091 | static void skd_release_irq(struct skd_device *skdev) | ||
4092 | { | ||
4093 | switch (skdev->irq_type) { | ||
4094 | case SKD_IRQ_MSIX: | ||
4095 | skd_release_msix(skdev); | ||
4096 | break; | ||
4097 | case SKD_IRQ_MSI: | ||
4098 | devm_free_irq(&skdev->pdev->dev, skdev->pdev->irq, skdev); | ||
4099 | pci_disable_msi(skdev->pdev); | ||
4100 | break; | ||
4101 | case SKD_IRQ_LEGACY: | ||
4102 | devm_free_irq(&skdev->pdev->dev, skdev->pdev->irq, skdev); | ||
4103 | break; | ||
4104 | default: | ||
4105 | pr_err("(%s): wrong irq type %d!\n", | ||
4106 | skd_name(skdev), skdev->irq_type); | ||
4107 | break; | ||
4108 | } | ||
4109 | } | ||
4110 | |||
4111 | /* | ||
4112 | ***************************************************************************** | ||
4113 | * CONSTRUCT | ||
4114 | ***************************************************************************** | ||
4115 | */ | ||
4116 | |||
4117 | static int skd_cons_skcomp(struct skd_device *skdev) | ||
4118 | { | ||
4119 | int rc = 0; | ||
4120 | struct fit_completion_entry_v1 *skcomp; | ||
4121 | u32 nbytes; | ||
4122 | |||
4123 | nbytes = sizeof(*skcomp) * SKD_N_COMPLETION_ENTRY; | ||
4124 | nbytes += sizeof(struct fit_comp_error_info) * SKD_N_COMPLETION_ENTRY; | ||
4125 | |||
4126 | pr_debug("%s:%s:%d comp pci_alloc, total bytes %d entries %d\n", | ||
4127 | skdev->name, __func__, __LINE__, | ||
4128 | nbytes, SKD_N_COMPLETION_ENTRY); | ||
4129 | |||
4130 | skcomp = pci_alloc_consistent(skdev->pdev, nbytes, | ||
4131 | &skdev->cq_dma_address); | ||
4132 | |||
4133 | if (skcomp == NULL) { | ||
4134 | rc = -ENOMEM; | ||
4135 | goto err_out; | ||
4136 | } | ||
4137 | |||
4138 | memset(skcomp, 0, nbytes); | ||
4139 | |||
4140 | skdev->skcomp_table = skcomp; | ||
4141 | skdev->skerr_table = (struct fit_comp_error_info *)((char *)skcomp + | ||
4142 | sizeof(*skcomp) * | ||
4143 | SKD_N_COMPLETION_ENTRY); | ||
4144 | |||
4145 | err_out: | ||
4146 | return rc; | ||
4147 | } | ||
4148 | |||
4149 | static int skd_cons_skmsg(struct skd_device *skdev) | ||
4150 | { | ||
4151 | int rc = 0; | ||
4152 | u32 i; | ||
4153 | |||
4154 | pr_debug("%s:%s:%d skmsg_table kzalloc, struct %lu, count %u total %lu\n", | ||
4155 | skdev->name, __func__, __LINE__, | ||
4156 | sizeof(struct skd_fitmsg_context), | ||
4157 | skdev->num_fitmsg_context, | ||
4158 | sizeof(struct skd_fitmsg_context) * skdev->num_fitmsg_context); | ||
4159 | |||
4160 | skdev->skmsg_table = kzalloc(sizeof(struct skd_fitmsg_context) | ||
4161 | *skdev->num_fitmsg_context, GFP_KERNEL); | ||
4162 | if (skdev->skmsg_table == NULL) { | ||
4163 | rc = -ENOMEM; | ||
4164 | goto err_out; | ||
4165 | } | ||
4166 | |||
4167 | for (i = 0; i < skdev->num_fitmsg_context; i++) { | ||
4168 | struct skd_fitmsg_context *skmsg; | ||
4169 | |||
4170 | skmsg = &skdev->skmsg_table[i]; | ||
4171 | |||
4172 | skmsg->id = i + SKD_ID_FIT_MSG; | ||
4173 | |||
4174 | skmsg->state = SKD_MSG_STATE_IDLE; | ||
4175 | skmsg->msg_buf = pci_alloc_consistent(skdev->pdev, | ||
4176 | SKD_N_FITMSG_BYTES + 64, | ||
4177 | &skmsg->mb_dma_address); | ||
4178 | |||
4179 | if (skmsg->msg_buf == NULL) { | ||
4180 | rc = -ENOMEM; | ||
4181 | goto err_out; | ||
4182 | } | ||
4183 | |||
4184 | skmsg->offset = (u32)((u64)skmsg->msg_buf & | ||
4185 | (~FIT_QCMD_BASE_ADDRESS_MASK)); | ||
4186 | skmsg->msg_buf += ~FIT_QCMD_BASE_ADDRESS_MASK; | ||
4187 | skmsg->msg_buf = (u8 *)((u64)skmsg->msg_buf & | ||
4188 | FIT_QCMD_BASE_ADDRESS_MASK); | ||
4189 | skmsg->mb_dma_address += ~FIT_QCMD_BASE_ADDRESS_MASK; | ||
4190 | skmsg->mb_dma_address &= FIT_QCMD_BASE_ADDRESS_MASK; | ||
4191 | memset(skmsg->msg_buf, 0, SKD_N_FITMSG_BYTES); | ||
4192 | |||
4193 | skmsg->next = &skmsg[1]; | ||
4194 | } | ||
4195 | |||
4196 | /* Free list is in order starting with the 0th entry. */ | ||
4197 | skdev->skmsg_table[i - 1].next = NULL; | ||
4198 | skdev->skmsg_free_list = skdev->skmsg_table; | ||
4199 | |||
4200 | err_out: | ||
4201 | return rc; | ||
4202 | } | ||
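The pointer arithmetic around FIT_QCMD_BASE_ADDRESS_MASK above is a round-up-to-alignment idiom. A worked example, under the assumption (suggested by the "+ 64" in the allocation) that the mask clears the low six address bits, so ~mask == 63:

    /* Hypothetical numbers, illustration only.  Suppose pci_alloc_consistent()
     * returned msg_buf == 0x12345010:
     *
     *   offset        = 0x12345010 & 63   = 0x10   (low bits of the original pointer)
     *   0x12345010 + 63                   = 0x1234504f
     *   0x1234504f & ~63                  = 0x12345040   (next 64-byte boundary)
     *
     * The DMA address gets the same adjustment so the CPU and device views
     * of the FIT message buffer stay in step.
     */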
4203 | |||
4204 | static struct fit_sg_descriptor *skd_cons_sg_list(struct skd_device *skdev, | ||
4205 | u32 n_sg, | ||
4206 | dma_addr_t *ret_dma_addr) | ||
4207 | { | ||
4208 | struct fit_sg_descriptor *sg_list; | ||
4209 | u32 nbytes; | ||
4210 | |||
4211 | nbytes = sizeof(*sg_list) * n_sg; | ||
4212 | |||
4213 | sg_list = pci_alloc_consistent(skdev->pdev, nbytes, ret_dma_addr); | ||
4214 | |||
4215 | if (sg_list != NULL) { | ||
4216 | uint64_t dma_address = *ret_dma_addr; | ||
4217 | u32 i; | ||
4218 | |||
4219 | memset(sg_list, 0, nbytes); | ||
4220 | |||
4221 | for (i = 0; i < n_sg - 1; i++) { | ||
4222 | uint64_t ndp_off; | ||
4223 | ndp_off = (i + 1) * sizeof(struct fit_sg_descriptor); | ||
4224 | |||
4225 | sg_list[i].next_desc_ptr = dma_address + ndp_off; | ||
4226 | } | ||
4227 | sg_list[i].next_desc_ptr = 0LL; | ||
4228 | } | ||
4229 | |||
4230 | return sg_list; | ||
4231 | } | ||
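The list built above is both a flat array and a chain: each descriptor carries the bus address of its successor, with the final link zeroed. For n_sg == 3 the result looks like this:

    /*
     *   sg_list[0].next_desc_ptr = *ret_dma_addr + 1 * sizeof(struct fit_sg_descriptor)
     *   sg_list[1].next_desc_ptr = *ret_dma_addr + 2 * sizeof(struct fit_sg_descriptor)
     *   sg_list[2].next_desc_ptr = 0   (end of chain)
     */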
4232 | |||
4233 | static int skd_cons_skreq(struct skd_device *skdev) | ||
4234 | { | ||
4235 | int rc = 0; | ||
4236 | u32 i; | ||
4237 | |||
4238 | pr_debug("%s:%s:%d skreq_table kzalloc, struct %lu, count %u total %lu\n", | ||
4239 | skdev->name, __func__, __LINE__, | ||
4240 | sizeof(struct skd_request_context), | ||
4241 | skdev->num_req_context, | ||
4242 | sizeof(struct skd_request_context) * skdev->num_req_context); | ||
4243 | |||
4244 | skdev->skreq_table = kzalloc(sizeof(struct skd_request_context) | ||
4245 | * skdev->num_req_context, GFP_KERNEL); | ||
4246 | if (skdev->skreq_table == NULL) { | ||
4247 | rc = -ENOMEM; | ||
4248 | goto err_out; | ||
4249 | } | ||
4250 | |||
4251 | pr_debug("%s:%s:%d alloc sg_table sg_per_req %u scatlist %lu total %lu\n", | ||
4252 | skdev->name, __func__, __LINE__, | ||
4253 | skdev->sgs_per_request, sizeof(struct scatterlist), | ||
4254 | skdev->sgs_per_request * sizeof(struct scatterlist)); | ||
4255 | |||
4256 | for (i = 0; i < skdev->num_req_context; i++) { | ||
4257 | struct skd_request_context *skreq; | ||
4258 | |||
4259 | skreq = &skdev->skreq_table[i]; | ||
4260 | |||
4261 | skreq->id = i + SKD_ID_RW_REQUEST; | ||
4262 | skreq->state = SKD_REQ_STATE_IDLE; | ||
4263 | |||
4264 | skreq->sg = kzalloc(sizeof(struct scatterlist) * | ||
4265 | skdev->sgs_per_request, GFP_KERNEL); | ||
4266 | if (skreq->sg == NULL) { | ||
4267 | rc = -ENOMEM; | ||
4268 | goto err_out; | ||
4269 | } | ||
4270 | sg_init_table(skreq->sg, skdev->sgs_per_request); | ||
4271 | |||
4272 | skreq->sksg_list = skd_cons_sg_list(skdev, | ||
4273 | skdev->sgs_per_request, | ||
4274 | &skreq->sksg_dma_address); | ||
4275 | |||
4276 | if (skreq->sksg_list == NULL) { | ||
4277 | rc = -ENOMEM; | ||
4278 | goto err_out; | ||
4279 | } | ||
4280 | |||
4281 | skreq->next = &skreq[1]; | ||
4282 | } | ||
4283 | |||
4284 | /* Free list is in order starting with the 0th entry. */ | ||
4285 | skdev->skreq_table[i - 1].next = NULL; | ||
4286 | skdev->skreq_free_list = skdev->skreq_table; | ||
4287 | |||
4288 | err_out: | ||
4289 | return rc; | ||
4290 | } | ||
4291 | |||
4292 | static int skd_cons_skspcl(struct skd_device *skdev) | ||
4293 | { | ||
4294 | int rc = 0; | ||
4295 | u32 i, nbytes; | ||
4296 | |||
4297 | pr_debug("%s:%s:%d skspcl_table kzalloc, struct %lu, count %u total %lu\n", | ||
4298 | skdev->name, __func__, __LINE__, | ||
4299 | sizeof(struct skd_special_context), | ||
4300 | skdev->n_special, | ||
4301 | sizeof(struct skd_special_context) * skdev->n_special); | ||
4302 | |||
4303 | skdev->skspcl_table = kzalloc(sizeof(struct skd_special_context) | ||
4304 | * skdev->n_special, GFP_KERNEL); | ||
4305 | if (skdev->skspcl_table == NULL) { | ||
4306 | rc = -ENOMEM; | ||
4307 | goto err_out; | ||
4308 | } | ||
4309 | |||
4310 | for (i = 0; i < skdev->n_special; i++) { | ||
4311 | struct skd_special_context *skspcl; | ||
4312 | |||
4313 | skspcl = &skdev->skspcl_table[i]; | ||
4314 | |||
4315 | skspcl->req.id = i + SKD_ID_SPECIAL_REQUEST; | ||
4316 | skspcl->req.state = SKD_REQ_STATE_IDLE; | ||
4317 | |||
4318 | skspcl->req.next = &skspcl[1].req; | ||
4319 | |||
4320 | nbytes = SKD_N_SPECIAL_FITMSG_BYTES; | ||
4321 | |||
4322 | skspcl->msg_buf = pci_alloc_consistent(skdev->pdev, nbytes, | ||
4323 | &skspcl->mb_dma_address); | ||
4324 | if (skspcl->msg_buf == NULL) { | ||
4325 | rc = -ENOMEM; | ||
4326 | goto err_out; | ||
4327 | } | ||
4328 | |||
4329 | memset(skspcl->msg_buf, 0, nbytes); | ||
4330 | |||
4331 | skspcl->req.sg = kzalloc(sizeof(struct scatterlist) * | ||
4332 | SKD_N_SG_PER_SPECIAL, GFP_KERNEL); | ||
4333 | if (skspcl->req.sg == NULL) { | ||
4334 | rc = -ENOMEM; | ||
4335 | goto err_out; | ||
4336 | } | ||
4337 | |||
4338 | skspcl->req.sksg_list = skd_cons_sg_list(skdev, | ||
4339 | SKD_N_SG_PER_SPECIAL, | ||
4340 | &skspcl->req. | ||
4341 | sksg_dma_address); | ||
4342 | if (skspcl->req.sksg_list == NULL) { | ||
4343 | rc = -ENOMEM; | ||
4344 | goto err_out; | ||
4345 | } | ||
4346 | } | ||
4347 | |||
4348 | /* Free list is in order starting with the 0th entry. */ | ||
4349 | skdev->skspcl_table[i - 1].req.next = NULL; | ||
4350 | skdev->skspcl_free_list = skdev->skspcl_table; | ||
4351 | |||
4352 | return rc; | ||
4353 | |||
4354 | err_out: | ||
4355 | return rc; | ||
4356 | } | ||
4357 | |||
4358 | static int skd_cons_sksb(struct skd_device *skdev) | ||
4359 | { | ||
4360 | int rc = 0; | ||
4361 | struct skd_special_context *skspcl; | ||
4362 | u32 nbytes; | ||
4363 | |||
4364 | skspcl = &skdev->internal_skspcl; | ||
4365 | |||
4366 | skspcl->req.id = 0 + SKD_ID_INTERNAL; | ||
4367 | skspcl->req.state = SKD_REQ_STATE_IDLE; | ||
4368 | |||
4369 | nbytes = SKD_N_INTERNAL_BYTES; | ||
4370 | |||
4371 | skspcl->data_buf = pci_alloc_consistent(skdev->pdev, nbytes, | ||
4372 | &skspcl->db_dma_address); | ||
4373 | if (skspcl->data_buf == NULL) { | ||
4374 | rc = -ENOMEM; | ||
4375 | goto err_out; | ||
4376 | } | ||
4377 | |||
4378 | memset(skspcl->data_buf, 0, nbytes); | ||
4379 | |||
4380 | nbytes = SKD_N_SPECIAL_FITMSG_BYTES; | ||
4381 | skspcl->msg_buf = pci_alloc_consistent(skdev->pdev, nbytes, | ||
4382 | &skspcl->mb_dma_address); | ||
4383 | if (skspcl->msg_buf == NULL) { | ||
4384 | rc = -ENOMEM; | ||
4385 | goto err_out; | ||
4386 | } | ||
4387 | |||
4388 | memset(skspcl->msg_buf, 0, nbytes); | ||
4389 | |||
4390 | skspcl->req.sksg_list = skd_cons_sg_list(skdev, 1, | ||
4391 | &skspcl->req.sksg_dma_address); | ||
4392 | if (skspcl->req.sksg_list == NULL) { | ||
4393 | rc = -ENOMEM; | ||
4394 | goto err_out; | ||
4395 | } | ||
4396 | |||
4397 | if (!skd_format_internal_skspcl(skdev)) { | ||
4398 | rc = -EINVAL; | ||
4399 | goto err_out; | ||
4400 | } | ||
4401 | |||
4402 | err_out: | ||
4403 | return rc; | ||
4404 | } | ||
4405 | |||
4406 | static int skd_cons_disk(struct skd_device *skdev) | ||
4407 | { | ||
4408 | int rc = 0; | ||
4409 | struct gendisk *disk; | ||
4410 | struct request_queue *q; | ||
4411 | unsigned long flags; | ||
4412 | |||
4413 | disk = alloc_disk(SKD_MINORS_PER_DEVICE); | ||
4414 | if (!disk) { | ||
4415 | rc = -ENOMEM; | ||
4416 | goto err_out; | ||
4417 | } | ||
4418 | |||
4419 | skdev->disk = disk; | ||
4420 | sprintf(disk->disk_name, DRV_NAME "%u", skdev->devno); | ||
4421 | |||
4422 | disk->major = skdev->major; | ||
4423 | disk->first_minor = skdev->devno * SKD_MINORS_PER_DEVICE; | ||
4424 | disk->fops = &skd_blockdev_ops; | ||
4425 | disk->private_data = skdev; | ||
4426 | |||
4427 | q = blk_init_queue(skd_request_fn, &skdev->lock); | ||
4428 | if (!q) { | ||
4429 | rc = -ENOMEM; | ||
4430 | goto err_out; | ||
4431 | } | ||
4432 | |||
4433 | skdev->queue = q; | ||
4434 | disk->queue = q; | ||
4435 | q->queuedata = skdev; | ||
4436 | |||
4437 | blk_queue_flush(q, REQ_FLUSH | REQ_FUA); | ||
4438 | blk_queue_max_segments(q, skdev->sgs_per_request); | ||
4439 | blk_queue_max_hw_sectors(q, SKD_N_MAX_SECTORS); | ||
4440 | |||
4441 | /* set sysfs optimal_io_size to 8K */ | ||
4442 | blk_queue_io_opt(q, 8192); | ||
4443 | |||
4444 | /* DISCARD Flag initialization. */ | ||
4445 | q->limits.discard_granularity = 8192; | ||
4446 | q->limits.discard_alignment = 0; | ||
4447 | q->limits.max_discard_sectors = UINT_MAX >> 9; | ||
4448 | q->limits.discard_zeroes_data = 1; | ||
4449 | queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); | ||
4450 | queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); | ||
4451 | |||
4452 | spin_lock_irqsave(&skdev->lock, flags); | ||
4453 | pr_debug("%s:%s:%d stopping %s queue\n", | ||
4454 | skdev->name, __func__, __LINE__, skdev->name); | ||
4455 | blk_stop_queue(skdev->queue); | ||
4456 | spin_unlock_irqrestore(&skdev->lock, flags); | ||
4457 | |||
4458 | err_out: | ||
4459 | return rc; | ||
4460 | } | ||
4461 | |||
4462 | #define SKD_N_DEV_TABLE 16u | ||
4463 | static u32 skd_next_devno; | ||
4464 | |||
4465 | static struct skd_device *skd_construct(struct pci_dev *pdev) | ||
4466 | { | ||
4467 | struct skd_device *skdev; | ||
4468 | int blk_major = skd_major; | ||
4469 | int rc; | ||
4470 | |||
4471 | skdev = kzalloc(sizeof(*skdev), GFP_KERNEL); | ||
4472 | |||
4473 | if (!skdev) { | ||
4474 | pr_err(PFX "(%s): memory alloc failure\n", | ||
4475 | pci_name(pdev)); | ||
4476 | return NULL; | ||
4477 | } | ||
4478 | |||
4479 | skdev->state = SKD_DRVR_STATE_LOAD; | ||
4480 | skdev->pdev = pdev; | ||
4481 | skdev->devno = skd_next_devno++; | ||
4482 | skdev->major = blk_major; | ||
4483 | skdev->irq_type = skd_isr_type; | ||
4484 | sprintf(skdev->name, DRV_NAME "%d", skdev->devno); | ||
4485 | skdev->dev_max_queue_depth = 0; | ||
4486 | |||
4487 | skdev->num_req_context = skd_max_queue_depth; | ||
4488 | skdev->num_fitmsg_context = skd_max_queue_depth; | ||
4489 | skdev->n_special = skd_max_pass_thru; | ||
4490 | skdev->cur_max_queue_depth = 1; | ||
4491 | skdev->queue_low_water_mark = 1; | ||
4492 | skdev->proto_ver = 99; | ||
4493 | skdev->sgs_per_request = skd_sgs_per_request; | ||
4494 | skdev->dbg_level = skd_dbg_level; | ||
4495 | |||
4496 | atomic_set(&skdev->device_count, 0); | ||
4497 | |||
4498 | spin_lock_init(&skdev->lock); | ||
4499 | |||
4500 | INIT_WORK(&skdev->completion_worker, skd_completion_worker); | ||
4501 | |||
4502 | pr_debug("%s:%s:%d skcomp\n", skdev->name, __func__, __LINE__); | ||
4503 | rc = skd_cons_skcomp(skdev); | ||
4504 | if (rc < 0) | ||
4505 | goto err_out; | ||
4506 | |||
4507 | pr_debug("%s:%s:%d skmsg\n", skdev->name, __func__, __LINE__); | ||
4508 | rc = skd_cons_skmsg(skdev); | ||
4509 | if (rc < 0) | ||
4510 | goto err_out; | ||
4511 | |||
4512 | pr_debug("%s:%s:%d skreq\n", skdev->name, __func__, __LINE__); | ||
4513 | rc = skd_cons_skreq(skdev); | ||
4514 | if (rc < 0) | ||
4515 | goto err_out; | ||
4516 | |||
4517 | pr_debug("%s:%s:%d skspcl\n", skdev->name, __func__, __LINE__); | ||
4518 | rc = skd_cons_skspcl(skdev); | ||
4519 | if (rc < 0) | ||
4520 | goto err_out; | ||
4521 | |||
4522 | pr_debug("%s:%s:%d sksb\n", skdev->name, __func__, __LINE__); | ||
4523 | rc = skd_cons_sksb(skdev); | ||
4524 | if (rc < 0) | ||
4525 | goto err_out; | ||
4526 | |||
4527 | pr_debug("%s:%s:%d disk\n", skdev->name, __func__, __LINE__); | ||
4528 | rc = skd_cons_disk(skdev); | ||
4529 | if (rc < 0) | ||
4530 | goto err_out; | ||
4531 | |||
4532 | pr_debug("%s:%s:%d VICTORY\n", skdev->name, __func__, __LINE__); | ||
4533 | return skdev; | ||
4534 | |||
4535 | err_out: | ||
4536 | pr_debug("%s:%s:%d construct failed\n", | ||
4537 | skdev->name, __func__, __LINE__); | ||
4538 | skd_destruct(skdev); | ||
4539 | return NULL; | ||
4540 | } | ||
4541 | |||
4542 | /* | ||
4543 | ***************************************************************************** | ||
4544 | * DESTRUCT (FREE) | ||
4545 | ***************************************************************************** | ||
4546 | */ | ||
4547 | |||
4548 | static void skd_free_skcomp(struct skd_device *skdev) | ||
4549 | { | ||
4550 | if (skdev->skcomp_table != NULL) { | ||
4551 | u32 nbytes; | ||
4552 | |||
4553 | nbytes = sizeof(skdev->skcomp_table[0]) * | ||
4554 | SKD_N_COMPLETION_ENTRY; | ||
4555 | pci_free_consistent(skdev->pdev, nbytes, | ||
4556 | skdev->skcomp_table, skdev->cq_dma_address); | ||
4557 | } | ||
4558 | |||
4559 | skdev->skcomp_table = NULL; | ||
4560 | skdev->cq_dma_address = 0; | ||
4561 | } | ||
4562 | |||
4563 | static void skd_free_skmsg(struct skd_device *skdev) | ||
4564 | { | ||
4565 | u32 i; | ||
4566 | |||
4567 | if (skdev->skmsg_table == NULL) | ||
4568 | return; | ||
4569 | |||
4570 | for (i = 0; i < skdev->num_fitmsg_context; i++) { | ||
4571 | struct skd_fitmsg_context *skmsg; | ||
4572 | |||
4573 | skmsg = &skdev->skmsg_table[i]; | ||
4574 | |||
4575 | if (skmsg->msg_buf != NULL) { | ||
4576 | skmsg->msg_buf += skmsg->offset; | ||
4577 | skmsg->mb_dma_address += skmsg->offset; | ||
4578 | pci_free_consistent(skdev->pdev, SKD_N_FITMSG_BYTES, | ||
4579 | skmsg->msg_buf, | ||
4580 | skmsg->mb_dma_address); | ||
4581 | } | ||
4582 | skmsg->msg_buf = NULL; | ||
4583 | skmsg->mb_dma_address = 0; | ||
4584 | } | ||
4585 | |||
4586 | kfree(skdev->skmsg_table); | ||
4587 | skdev->skmsg_table = NULL; | ||
4588 | } | ||
4589 | |||
4590 | static void skd_free_sg_list(struct skd_device *skdev, | ||
4591 | struct fit_sg_descriptor *sg_list, | ||
4592 | u32 n_sg, dma_addr_t dma_addr) | ||
4593 | { | ||
4594 | if (sg_list != NULL) { | ||
4595 | u32 nbytes; | ||
4596 | |||
4597 | nbytes = sizeof(*sg_list) * n_sg; | ||
4598 | |||
4599 | pci_free_consistent(skdev->pdev, nbytes, sg_list, dma_addr); | ||
4600 | } | ||
4601 | } | ||
4602 | |||
4603 | static void skd_free_skreq(struct skd_device *skdev) | ||
4604 | { | ||
4605 | u32 i; | ||
4606 | |||
4607 | if (skdev->skreq_table == NULL) | ||
4608 | return; | ||
4609 | |||
4610 | for (i = 0; i < skdev->num_req_context; i++) { | ||
4611 | struct skd_request_context *skreq; | ||
4612 | |||
4613 | skreq = &skdev->skreq_table[i]; | ||
4614 | |||
4615 | skd_free_sg_list(skdev, skreq->sksg_list, | ||
4616 | skdev->sgs_per_request, | ||
4617 | skreq->sksg_dma_address); | ||
4618 | |||
4619 | skreq->sksg_list = NULL; | ||
4620 | skreq->sksg_dma_address = 0; | ||
4621 | |||
4622 | kfree(skreq->sg); | ||
4623 | } | ||
4624 | |||
4625 | kfree(skdev->skreq_table); | ||
4626 | skdev->skreq_table = NULL; | ||
4627 | } | ||
4628 | |||
4629 | static void skd_free_skspcl(struct skd_device *skdev) | ||
4630 | { | ||
4631 | u32 i; | ||
4632 | u32 nbytes; | ||
4633 | |||
4634 | if (skdev->skspcl_table == NULL) | ||
4635 | return; | ||
4636 | |||
4637 | for (i = 0; i < skdev->n_special; i++) { | ||
4638 | struct skd_special_context *skspcl; | ||
4639 | |||
4640 | skspcl = &skdev->skspcl_table[i]; | ||
4641 | |||
4642 | if (skspcl->msg_buf != NULL) { | ||
4643 | nbytes = SKD_N_SPECIAL_FITMSG_BYTES; | ||
4644 | pci_free_consistent(skdev->pdev, nbytes, | ||
4645 | skspcl->msg_buf, | ||
4646 | skspcl->mb_dma_address); | ||
4647 | } | ||
4648 | |||
4649 | skspcl->msg_buf = NULL; | ||
4650 | skspcl->mb_dma_address = 0; | ||
4651 | |||
4652 | skd_free_sg_list(skdev, skspcl->req.sksg_list, | ||
4653 | SKD_N_SG_PER_SPECIAL, | ||
4654 | skspcl->req.sksg_dma_address); | ||
4655 | |||
4656 | skspcl->req.sksg_list = NULL; | ||
4657 | skspcl->req.sksg_dma_address = 0; | ||
4658 | |||
4659 | kfree(skspcl->req.sg); | ||
4660 | } | ||
4661 | |||
4662 | kfree(skdev->skspcl_table); | ||
4663 | skdev->skspcl_table = NULL; | ||
4664 | } | ||
4665 | |||
4666 | static void skd_free_sksb(struct skd_device *skdev) | ||
4667 | { | ||
4668 | struct skd_special_context *skspcl; | ||
4669 | u32 nbytes; | ||
4670 | |||
4671 | skspcl = &skdev->internal_skspcl; | ||
4672 | |||
4673 | if (skspcl->data_buf != NULL) { | ||
4674 | nbytes = SKD_N_INTERNAL_BYTES; | ||
4675 | |||
4676 | pci_free_consistent(skdev->pdev, nbytes, | ||
4677 | skspcl->data_buf, skspcl->db_dma_address); | ||
4678 | } | ||
4679 | |||
4680 | skspcl->data_buf = NULL; | ||
4681 | skspcl->db_dma_address = 0; | ||
4682 | |||
4683 | if (skspcl->msg_buf != NULL) { | ||
4684 | nbytes = SKD_N_SPECIAL_FITMSG_BYTES; | ||
4685 | pci_free_consistent(skdev->pdev, nbytes, | ||
4686 | skspcl->msg_buf, skspcl->mb_dma_address); | ||
4687 | } | ||
4688 | |||
4689 | skspcl->msg_buf = NULL; | ||
4690 | skspcl->mb_dma_address = 0; | ||
4691 | |||
4692 | skd_free_sg_list(skdev, skspcl->req.sksg_list, 1, | ||
4693 | skspcl->req.sksg_dma_address); | ||
4694 | |||
4695 | skspcl->req.sksg_list = NULL; | ||
4696 | skspcl->req.sksg_dma_address = 0; | ||
4697 | } | ||
4698 | |||
4699 | static void skd_free_disk(struct skd_device *skdev) | ||
4700 | { | ||
4701 | struct gendisk *disk = skdev->disk; | ||
4702 | |||
4703 | if (disk != NULL) { | ||
4704 | struct request_queue *q = disk->queue; | ||
4705 | |||
4706 | if (disk->flags & GENHD_FL_UP) | ||
4707 | del_gendisk(disk); | ||
4708 | if (q) | ||
4709 | blk_cleanup_queue(q); | ||
4710 | put_disk(disk); | ||
4711 | } | ||
4712 | skdev->disk = NULL; | ||
4713 | } | ||
4714 | |||
4715 | static void skd_destruct(struct skd_device *skdev) | ||
4716 | { | ||
4717 | if (skdev == NULL) | ||
4718 | return; | ||
4719 | |||
4720 | |||
4721 | pr_debug("%s:%s:%d disk\n", skdev->name, __func__, __LINE__); | ||
4722 | skd_free_disk(skdev); | ||
4723 | |||
4724 | pr_debug("%s:%s:%d sksb\n", skdev->name, __func__, __LINE__); | ||
4725 | skd_free_sksb(skdev); | ||
4726 | |||
4727 | pr_debug("%s:%s:%d skspcl\n", skdev->name, __func__, __LINE__); | ||
4728 | skd_free_skspcl(skdev); | ||
4729 | |||
4730 | pr_debug("%s:%s:%d skreq\n", skdev->name, __func__, __LINE__); | ||
4731 | skd_free_skreq(skdev); | ||
4732 | |||
4733 | pr_debug("%s:%s:%d skmsg\n", skdev->name, __func__, __LINE__); | ||
4734 | skd_free_skmsg(skdev); | ||
4735 | |||
4736 | pr_debug("%s:%s:%d skcomp\n", skdev->name, __func__, __LINE__); | ||
4737 | skd_free_skcomp(skdev); | ||
4738 | |||
4739 | pr_debug("%s:%s:%d skdev\n", skdev->name, __func__, __LINE__); | ||
4740 | kfree(skdev); | ||
4741 | } | ||
4742 | |||
4743 | /* | ||
4744 | ***************************************************************************** | ||
4745 | * BLOCK DEVICE (BDEV) GLUE | ||
4746 | ***************************************************************************** | ||
4747 | */ | ||
4748 | |||
4749 | static int skd_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo) | ||
4750 | { | ||
4751 | struct skd_device *skdev; | ||
4752 | u64 capacity; | ||
4753 | |||
4754 | skdev = bdev->bd_disk->private_data; | ||
4755 | |||
4756 | pr_debug("%s:%s:%d %s: CMD[%s] getgeo device\n", | ||
4757 | skdev->name, __func__, __LINE__, | ||
4758 | bdev->bd_disk->disk_name, current->comm); | ||
4759 | |||
4760 | if (skdev->read_cap_is_valid) { | ||
4761 | capacity = get_capacity(skdev->disk); | ||
4762 | geo->heads = 64; | ||
4763 | geo->sectors = 255; | ||
4764 | geo->cylinders = (capacity) / (255 * 64); | ||
4765 | |||
4766 | return 0; | ||
4767 | } | ||
4768 | return -EIO; | ||
4769 | } | ||
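The geometry reported above is synthetic: heads and sectors-per-track are fixed at 64 and 255, and the cylinder count is whatever the capacity works out to. A standalone sketch with a hypothetical capacity value:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t capacity = 976773168;	/* 512-byte sectors, roughly a 500 GB device (assumed value) */
		unsigned int heads = 64, sectors = 255;
		uint64_t cylinders = capacity / (heads * sectors);	/* 976773168 / 16320 = 59851 */

		printf("C/H/S = %llu/%u/%u\n", (unsigned long long)cylinders, heads, sectors);
		return 0;
	}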
4770 | |||
4771 | static int skd_bdev_attach(struct skd_device *skdev) | ||
4772 | { | ||
4773 | pr_debug("%s:%s:%d add_disk\n", skdev->name, __func__, __LINE__); | ||
4774 | add_disk(skdev->disk); | ||
4775 | return 0; | ||
4776 | } | ||
4777 | |||
4778 | static const struct block_device_operations skd_blockdev_ops = { | ||
4779 | .owner = THIS_MODULE, | ||
4780 | .ioctl = skd_bdev_ioctl, | ||
4781 | .getgeo = skd_bdev_getgeo, | ||
4782 | }; | ||
4783 | |||
4784 | |||
4785 | /* | ||
4786 | ***************************************************************************** | ||
4787 | * PCIe DRIVER GLUE | ||
4788 | ***************************************************************************** | ||
4789 | */ | ||
4790 | |||
4791 | static DEFINE_PCI_DEVICE_TABLE(skd_pci_tbl) = { | ||
4792 | { PCI_VENDOR_ID_STEC, PCI_DEVICE_ID_S1120, | ||
4793 | PCI_ANY_ID, PCI_ANY_ID, 0, 0, }, | ||
4794 | { 0 } /* terminate list */ | ||
4795 | }; | ||
4796 | |||
4797 | MODULE_DEVICE_TABLE(pci, skd_pci_tbl); | ||
4798 | |||
4799 | static char *skd_pci_info(struct skd_device *skdev, char *str) | ||
4800 | { | ||
4801 | int pcie_reg; | ||
4802 | |||
4803 | strcpy(str, "PCIe ("); | ||
4804 | pcie_reg = pci_find_capability(skdev->pdev, PCI_CAP_ID_EXP); | ||
4805 | |||
4806 | if (pcie_reg) { | ||
4807 | |||
4808 | char lwstr[6]; | ||
4809 | uint16_t pcie_lstat, lspeed, lwidth; | ||
4810 | |||
4811 | pcie_reg += 0x12; | ||
4812 | pci_read_config_word(skdev->pdev, pcie_reg, &pcie_lstat); | ||
4813 | lspeed = pcie_lstat & (0xF); | ||
4814 | lwidth = (pcie_lstat & 0x3F0) >> 4; | ||
4815 | |||
4816 | if (lspeed == 1) | ||
4817 | strcat(str, "2.5GT/s "); | ||
4818 | else if (lspeed == 2) | ||
4819 | strcat(str, "5.0GT/s "); | ||
4820 | else | ||
4821 | strcat(str, "<unknown> "); | ||
4822 | snprintf(lwstr, sizeof(lwstr), "%dX)", lwidth); | ||
4823 | strcat(str, lwstr); | ||
4824 | } | ||
4825 | return str; | ||
4826 | } | ||
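skd_pci_info() reads the Link Status word at offset 0x12 of the PCI Express capability and splits it into a speed code (bits 3:0) and a negotiated link width (bits 9:4). A standalone sketch of the same decode; print_link_status() is an illustrative name, not a driver function.

	#include <stdint.h>
	#include <stdio.h>

	static void print_link_status(uint16_t lstat)
	{
		unsigned int speed = lstat & 0xF;		/* 1 = 2.5GT/s, 2 = 5.0GT/s */
		unsigned int width = (lstat & 0x3F0) >> 4;	/* negotiated lane count */

		printf("PCIe (%s %uX)\n",
		       speed == 1 ? "2.5GT/s" :
		       speed == 2 ? "5.0GT/s" : "<unknown>",
		       width);
	}

	int main(void)
	{
		print_link_status(0x0042);	/* example value: 5.0GT/s, x4 */
		return 0;
	}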
4827 | |||
4828 | static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) | ||
4829 | { | ||
4830 | int i; | ||
4831 | int rc = 0; | ||
4832 | char pci_str[32]; | ||
4833 | struct skd_device *skdev; | ||
4834 | |||
4835 | pr_info("STEC s1120 Driver(%s) version %s-b%s\n", | ||
4836 | DRV_NAME, DRV_VERSION, DRV_BUILD_ID); | ||
4837 | pr_info("(skd?:??:[%s]): vendor=%04X device=%04x\n", | ||
4838 | pci_name(pdev), pdev->vendor, pdev->device); | ||
4839 | |||
4840 | rc = pci_enable_device(pdev); | ||
4841 | if (rc) | ||
4842 | return rc; | ||
4843 | rc = pci_request_regions(pdev, DRV_NAME); | ||
4844 | if (rc) | ||
4845 | goto err_out; | ||
4846 | rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); | ||
4847 | if (!rc) { | ||
4848 | if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) { | ||
4849 | |||
4850 | pr_err("(%s): consistent DMA mask error %d\n", | ||
4851 | pci_name(pdev), rc); | ||
4852 | } | ||
4853 | } else { | ||
4854 | rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); | ||
4855 | if (rc) { | ||
4856 | |||
4857 | pr_err("(%s): DMA mask error %d\n", | ||
4858 | pci_name(pdev), rc); | ||
4859 | goto err_out_regions; | ||
4860 | } | ||
4861 | } | ||
4862 | |||
4863 | if (!skd_major) { | ||
4864 | rc = register_blkdev(0, DRV_NAME); | ||
4865 | if (rc < 0) | ||
4866 | goto err_out_regions; | ||
4867 | BUG_ON(!rc); | ||
4868 | skd_major = rc; | ||
4869 | } | ||
4870 | |||
4871 | skdev = skd_construct(pdev); | ||
4872 | if (skdev == NULL) { | ||
4873 | rc = -ENOMEM; | ||
4874 | goto err_out_regions; | ||
4875 | } | ||
4876 | |||
4877 | skd_pci_info(skdev, pci_str); | ||
4878 | pr_info("(%s): %s 64bit\n", skd_name(skdev), pci_str); | ||
4879 | |||
4880 | pci_set_master(pdev); | ||
4881 | rc = pci_enable_pcie_error_reporting(pdev); | ||
4882 | if (rc) { | ||
4883 | pr_err( | ||
4884 | "(%s): bad enable of PCIe error reporting rc=%d\n", | ||
4885 | skd_name(skdev), rc); | ||
4886 | skdev->pcie_error_reporting_is_enabled = 0; | ||
4887 | } else | ||
4888 | skdev->pcie_error_reporting_is_enabled = 1; | ||
4889 | |||
4890 | |||
4891 | pci_set_drvdata(pdev, skdev); | ||
4892 | |||
4893 | skdev->disk->driverfs_dev = &pdev->dev; | ||
4894 | |||
4895 | for (i = 0; i < SKD_MAX_BARS; i++) { | ||
4896 | skdev->mem_phys[i] = pci_resource_start(pdev, i); | ||
4897 | skdev->mem_size[i] = (u32)pci_resource_len(pdev, i); | ||
4898 | skdev->mem_map[i] = ioremap(skdev->mem_phys[i], | ||
4899 | skdev->mem_size[i]); | ||
4900 | if (!skdev->mem_map[i]) { | ||
4901 | pr_err("(%s): Unable to map adapter memory!\n", | ||
4902 | skd_name(skdev)); | ||
4903 | rc = -ENODEV; | ||
4904 | goto err_out_iounmap; | ||
4905 | } | ||
4906 | pr_debug("%s:%s:%d mem_map=%p, phyd=%016llx, size=%d\n", | ||
4907 | skdev->name, __func__, __LINE__, | ||
4908 | skdev->mem_map[i], | ||
4909 | (uint64_t)skdev->mem_phys[i], skdev->mem_size[i]); | ||
4910 | } | ||
4911 | |||
4912 | rc = skd_acquire_irq(skdev); | ||
4913 | if (rc) { | ||
4914 | pr_err("(%s): interrupt resource error %d\n", | ||
4915 | skd_name(skdev), rc); | ||
4916 | goto err_out_iounmap; | ||
4917 | } | ||
4918 | |||
4919 | rc = skd_start_timer(skdev); | ||
4920 | if (rc) | ||
4921 | goto err_out_timer; | ||
4922 | |||
4923 | init_waitqueue_head(&skdev->waitq); | ||
4924 | |||
4925 | skd_start_device(skdev); | ||
4926 | |||
4927 | rc = wait_event_interruptible_timeout(skdev->waitq, | ||
4928 | (skdev->gendisk_on), | ||
4929 | (SKD_START_WAIT_SECONDS * HZ)); | ||
4930 | if (skdev->gendisk_on > 0) { | ||
4931 | /* device came on-line after reset */ | ||
4932 | skd_bdev_attach(skdev); | ||
4933 | rc = 0; | ||
4934 | } else { | ||
4935 | /* we timed out, something is wrong with the device, | ||
4936 | don't add the disk structure */ | ||
4937 | pr_err( | ||
4938 | "(%s): error: waiting for s1120 timed out %d!\n", | ||
4939 | skd_name(skdev), rc); | ||
4941 | /* in case of no error, the wait timed out; return -ENXIO */ | ||
4941 | if (!rc) | ||
4942 | rc = -ENXIO; | ||
4943 | goto err_out_timer; | ||
4944 | } | ||
4945 | |||
4946 | |||
4947 | #ifdef SKD_VMK_POLL_HANDLER | ||
4948 | if (skdev->irq_type == SKD_IRQ_MSIX) { | ||
4949 | /* MSIX completion handler is being used for coredump */ | ||
4950 | vmklnx_scsi_register_poll_handler(skdev->scsi_host, | ||
4951 | skdev->msix_entries[5].vector, | ||
4952 | skd_comp_q, skdev); | ||
4953 | } else { | ||
4954 | vmklnx_scsi_register_poll_handler(skdev->scsi_host, | ||
4955 | skdev->pdev->irq, skd_isr, | ||
4956 | skdev); | ||
4957 | } | ||
4958 | #endif /* SKD_VMK_POLL_HANDLER */ | ||
4959 | |||
4960 | return rc; | ||
4961 | |||
4962 | err_out_timer: | ||
4963 | skd_stop_device(skdev); | ||
4964 | skd_release_irq(skdev); | ||
4965 | |||
4966 | err_out_iounmap: | ||
4967 | for (i = 0; i < SKD_MAX_BARS; i++) | ||
4968 | if (skdev->mem_map[i]) | ||
4969 | iounmap(skdev->mem_map[i]); | ||
4970 | |||
4971 | if (skdev->pcie_error_reporting_is_enabled) | ||
4972 | pci_disable_pcie_error_reporting(pdev); | ||
4973 | |||
4974 | skd_destruct(skdev); | ||
4975 | |||
4976 | err_out_regions: | ||
4977 | pci_release_regions(pdev); | ||
4978 | |||
4979 | err_out: | ||
4980 | pci_disable_device(pdev); | ||
4981 | pci_set_drvdata(pdev, NULL); | ||
4982 | return rc; | ||
4983 | } | ||
4984 | |||
4985 | static void skd_pci_remove(struct pci_dev *pdev) | ||
4986 | { | ||
4987 | int i; | ||
4988 | struct skd_device *skdev; | ||
4989 | |||
4990 | skdev = pci_get_drvdata(pdev); | ||
4991 | if (!skdev) { | ||
4992 | pr_err("%s: no device data for PCI\n", pci_name(pdev)); | ||
4993 | return; | ||
4994 | } | ||
4995 | skd_stop_device(skdev); | ||
4996 | skd_release_irq(skdev); | ||
4997 | |||
4998 | for (i = 0; i < SKD_MAX_BARS; i++) | ||
4999 | if (skdev->mem_map[i]) | ||
5000 | iounmap((u32 *)skdev->mem_map[i]); | ||
5001 | |||
5002 | if (skdev->pcie_error_reporting_is_enabled) | ||
5003 | pci_disable_pcie_error_reporting(pdev); | ||
5004 | |||
5005 | skd_destruct(skdev); | ||
5006 | |||
5007 | pci_release_regions(pdev); | ||
5008 | pci_disable_device(pdev); | ||
5009 | pci_set_drvdata(pdev, NULL); | ||
5010 | |||
5011 | return; | ||
5012 | } | ||
5013 | |||
5014 | static int skd_pci_suspend(struct pci_dev *pdev, pm_message_t state) | ||
5015 | { | ||
5016 | int i; | ||
5017 | struct skd_device *skdev; | ||
5018 | |||
5019 | skdev = pci_get_drvdata(pdev); | ||
5020 | if (!skdev) { | ||
5021 | pr_err("%s: no device data for PCI\n", pci_name(pdev)); | ||
5022 | return -EIO; | ||
5023 | } | ||
5024 | |||
5025 | skd_stop_device(skdev); | ||
5026 | |||
5027 | skd_release_irq(skdev); | ||
5028 | |||
5029 | for (i = 0; i < SKD_MAX_BARS; i++) | ||
5030 | if (skdev->mem_map[i]) | ||
5031 | iounmap((u32 *)skdev->mem_map[i]); | ||
5032 | |||
5033 | if (skdev->pcie_error_reporting_is_enabled) | ||
5034 | pci_disable_pcie_error_reporting(pdev); | ||
5035 | |||
5036 | pci_release_regions(pdev); | ||
5037 | pci_save_state(pdev); | ||
5038 | pci_disable_device(pdev); | ||
5039 | pci_set_power_state(pdev, pci_choose_state(pdev, state)); | ||
5040 | return 0; | ||
5041 | } | ||
5042 | |||
5043 | static int skd_pci_resume(struct pci_dev *pdev) | ||
5044 | { | ||
5045 | int i; | ||
5046 | int rc = 0; | ||
5047 | struct skd_device *skdev; | ||
5048 | |||
5049 | skdev = pci_get_drvdata(pdev); | ||
5050 | if (!skdev) { | ||
5051 | pr_err("%s: no device data for PCI\n", pci_name(pdev)); | ||
5052 | return -1; | ||
5053 | } | ||
5054 | |||
5055 | pci_set_power_state(pdev, PCI_D0); | ||
5056 | pci_enable_wake(pdev, PCI_D0, 0); | ||
5057 | pci_restore_state(pdev); | ||
5058 | |||
5059 | rc = pci_enable_device(pdev); | ||
5060 | if (rc) | ||
5061 | return rc; | ||
5062 | rc = pci_request_regions(pdev, DRV_NAME); | ||
5063 | if (rc) | ||
5064 | goto err_out; | ||
5065 | rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); | ||
5066 | if (!rc) { | ||
5067 | if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) { | ||
5068 | |||
5069 | pr_err("(%s): consistent DMA mask error %d\n", | ||
5070 | pci_name(pdev), rc); | ||
5071 | } | ||
5072 | } else { | ||
5073 | rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); | ||
5074 | if (rc) { | ||
5075 | |||
5076 | pr_err("(%s): DMA mask error %d\n", | ||
5077 | pci_name(pdev), rc); | ||
5078 | goto err_out_regions; | ||
5079 | } | ||
5080 | } | ||
5081 | |||
5082 | pci_set_master(pdev); | ||
5083 | rc = pci_enable_pcie_error_reporting(pdev); | ||
5084 | if (rc) { | ||
5085 | pr_err("(%s): bad enable of PCIe error reporting rc=%d\n", | ||
5086 | skdev->name, rc); | ||
5087 | skdev->pcie_error_reporting_is_enabled = 0; | ||
5088 | } else | ||
5089 | skdev->pcie_error_reporting_is_enabled = 1; | ||
5090 | |||
5091 | for (i = 0; i < SKD_MAX_BARS; i++) { | ||
5092 | |||
5093 | skdev->mem_phys[i] = pci_resource_start(pdev, i); | ||
5094 | skdev->mem_size[i] = (u32)pci_resource_len(pdev, i); | ||
5095 | skdev->mem_map[i] = ioremap(skdev->mem_phys[i], | ||
5096 | skdev->mem_size[i]); | ||
5097 | if (!skdev->mem_map[i]) { | ||
5098 | pr_err("(%s): Unable to map adapter memory!\n", | ||
5099 | skd_name(skdev)); | ||
5100 | rc = -ENODEV; | ||
5101 | goto err_out_iounmap; | ||
5102 | } | ||
5103 | pr_debug("%s:%s:%d mem_map=%p, phyd=%016llx, size=%d\n", | ||
5104 | skdev->name, __func__, __LINE__, | ||
5105 | skdev->mem_map[i], | ||
5106 | (uint64_t)skdev->mem_phys[i], skdev->mem_size[i]); | ||
5107 | } | ||
5108 | rc = skd_acquire_irq(skdev); | ||
5109 | if (rc) { | ||
5110 | |||
5111 | pr_err("(%s): interrupt resource error %d\n", | ||
5112 | pci_name(pdev), rc); | ||
5113 | goto err_out_iounmap; | ||
5114 | } | ||
5115 | |||
5116 | rc = skd_start_timer(skdev); | ||
5117 | if (rc) | ||
5118 | goto err_out_timer; | ||
5119 | |||
5120 | init_waitqueue_head(&skdev->waitq); | ||
5121 | |||
5122 | skd_start_device(skdev); | ||
5123 | |||
5124 | return rc; | ||
5125 | |||
5126 | err_out_timer: | ||
5127 | skd_stop_device(skdev); | ||
5128 | skd_release_irq(skdev); | ||
5129 | |||
5130 | err_out_iounmap: | ||
5131 | for (i = 0; i < SKD_MAX_BARS; i++) | ||
5132 | if (skdev->mem_map[i]) | ||
5133 | iounmap(skdev->mem_map[i]); | ||
5134 | |||
5135 | if (skdev->pcie_error_reporting_is_enabled) | ||
5136 | pci_disable_pcie_error_reporting(pdev); | ||
5137 | |||
5138 | err_out_regions: | ||
5139 | pci_release_regions(pdev); | ||
5140 | |||
5141 | err_out: | ||
5142 | pci_disable_device(pdev); | ||
5143 | return rc; | ||
5144 | } | ||
5145 | |||
5146 | static void skd_pci_shutdown(struct pci_dev *pdev) | ||
5147 | { | ||
5148 | struct skd_device *skdev; | ||
5149 | |||
5150 | pr_err("skd_pci_shutdown called\n"); | ||
5151 | |||
5152 | skdev = pci_get_drvdata(pdev); | ||
5153 | if (!skdev) { | ||
5154 | pr_err("%s: no device data for PCI\n", pci_name(pdev)); | ||
5155 | return; | ||
5156 | } | ||
5157 | |||
5158 | pr_err("%s: calling stop\n", skd_name(skdev)); | ||
5159 | skd_stop_device(skdev); | ||
5160 | } | ||
5161 | |||
5162 | static struct pci_driver skd_driver = { | ||
5163 | .name = DRV_NAME, | ||
5164 | .id_table = skd_pci_tbl, | ||
5165 | .probe = skd_pci_probe, | ||
5166 | .remove = skd_pci_remove, | ||
5167 | .suspend = skd_pci_suspend, | ||
5168 | .resume = skd_pci_resume, | ||
5169 | .shutdown = skd_pci_shutdown, | ||
5170 | }; | ||
5171 | |||
5172 | /* | ||
5173 | ***************************************************************************** | ||
5174 | * LOGGING SUPPORT | ||
5175 | ***************************************************************************** | ||
5176 | */ | ||
5177 | |||
5178 | static const char *skd_name(struct skd_device *skdev) | ||
5179 | { | ||
5180 | memset(skdev->id_str, 0, sizeof(skdev->id_str)); | ||
5181 | |||
5182 | if (skdev->inquiry_is_valid) | ||
5183 | snprintf(skdev->id_str, sizeof(skdev->id_str), "%s:%s:[%s]", | ||
5184 | skdev->name, skdev->inq_serial_num, | ||
5185 | pci_name(skdev->pdev)); | ||
5186 | else | ||
5187 | snprintf(skdev->id_str, sizeof(skdev->id_str), "%s:??:[%s]", | ||
5188 | skdev->name, pci_name(skdev->pdev)); | ||
5189 | |||
5190 | return skdev->id_str; | ||
5191 | } | ||
5192 | |||
5193 | const char *skd_drive_state_to_str(int state) | ||
5194 | { | ||
5195 | switch (state) { | ||
5196 | case FIT_SR_DRIVE_OFFLINE: | ||
5197 | return "OFFLINE"; | ||
5198 | case FIT_SR_DRIVE_INIT: | ||
5199 | return "INIT"; | ||
5200 | case FIT_SR_DRIVE_ONLINE: | ||
5201 | return "ONLINE"; | ||
5202 | case FIT_SR_DRIVE_BUSY: | ||
5203 | return "BUSY"; | ||
5204 | case FIT_SR_DRIVE_FAULT: | ||
5205 | return "FAULT"; | ||
5206 | case FIT_SR_DRIVE_DEGRADED: | ||
5207 | return "DEGRADED"; | ||
5208 | case FIT_SR_PCIE_LINK_DOWN: | ||
5209 | return "LINK_DOWN"; | ||
5210 | case FIT_SR_DRIVE_SOFT_RESET: | ||
5211 | return "SOFT_RESET"; | ||
5212 | case FIT_SR_DRIVE_NEED_FW_DOWNLOAD: | ||
5213 | return "NEED_FW"; | ||
5214 | case FIT_SR_DRIVE_INIT_FAULT: | ||
5215 | return "INIT_FAULT"; | ||
5216 | case FIT_SR_DRIVE_BUSY_SANITIZE: | ||
5217 | return "BUSY_SANITIZE"; | ||
5218 | case FIT_SR_DRIVE_BUSY_ERASE: | ||
5219 | return "BUSY_ERASE"; | ||
5220 | case FIT_SR_DRIVE_FW_BOOTING: | ||
5221 | return "FW_BOOTING"; | ||
5222 | default: | ||
5223 | return "???"; | ||
5224 | } | ||
5225 | } | ||
5226 | |||
5227 | const char *skd_skdev_state_to_str(enum skd_drvr_state state) | ||
5228 | { | ||
5229 | switch (state) { | ||
5230 | case SKD_DRVR_STATE_LOAD: | ||
5231 | return "LOAD"; | ||
5232 | case SKD_DRVR_STATE_IDLE: | ||
5233 | return "IDLE"; | ||
5234 | case SKD_DRVR_STATE_BUSY: | ||
5235 | return "BUSY"; | ||
5236 | case SKD_DRVR_STATE_STARTING: | ||
5237 | return "STARTING"; | ||
5238 | case SKD_DRVR_STATE_ONLINE: | ||
5239 | return "ONLINE"; | ||
5240 | case SKD_DRVR_STATE_PAUSING: | ||
5241 | return "PAUSING"; | ||
5242 | case SKD_DRVR_STATE_PAUSED: | ||
5243 | return "PAUSED"; | ||
5244 | case SKD_DRVR_STATE_DRAINING_TIMEOUT: | ||
5245 | return "DRAINING_TIMEOUT"; | ||
5246 | case SKD_DRVR_STATE_RESTARTING: | ||
5247 | return "RESTARTING"; | ||
5248 | case SKD_DRVR_STATE_RESUMING: | ||
5249 | return "RESUMING"; | ||
5250 | case SKD_DRVR_STATE_STOPPING: | ||
5251 | return "STOPPING"; | ||
5252 | case SKD_DRVR_STATE_SYNCING: | ||
5253 | return "SYNCING"; | ||
5254 | case SKD_DRVR_STATE_FAULT: | ||
5255 | return "FAULT"; | ||
5256 | case SKD_DRVR_STATE_DISAPPEARED: | ||
5257 | return "DISAPPEARED"; | ||
5258 | case SKD_DRVR_STATE_BUSY_ERASE: | ||
5259 | return "BUSY_ERASE"; | ||
5260 | case SKD_DRVR_STATE_BUSY_SANITIZE: | ||
5261 | return "BUSY_SANITIZE"; | ||
5262 | case SKD_DRVR_STATE_BUSY_IMMINENT: | ||
5263 | return "BUSY_IMMINENT"; | ||
5264 | case SKD_DRVR_STATE_WAIT_BOOT: | ||
5265 | return "WAIT_BOOT"; | ||
5266 | |||
5267 | default: | ||
5268 | return "???"; | ||
5269 | } | ||
5270 | } | ||
5271 | |||
5272 | const char *skd_skmsg_state_to_str(enum skd_fit_msg_state state) | ||
5273 | { | ||
5274 | switch (state) { | ||
5275 | case SKD_MSG_STATE_IDLE: | ||
5276 | return "IDLE"; | ||
5277 | case SKD_MSG_STATE_BUSY: | ||
5278 | return "BUSY"; | ||
5279 | default: | ||
5280 | return "???"; | ||
5281 | } | ||
5282 | } | ||
5283 | |||
5284 | const char *skd_skreq_state_to_str(enum skd_req_state state) | ||
5285 | { | ||
5286 | switch (state) { | ||
5287 | case SKD_REQ_STATE_IDLE: | ||
5288 | return "IDLE"; | ||
5289 | case SKD_REQ_STATE_SETUP: | ||
5290 | return "SETUP"; | ||
5291 | case SKD_REQ_STATE_BUSY: | ||
5292 | return "BUSY"; | ||
5293 | case SKD_REQ_STATE_COMPLETED: | ||
5294 | return "COMPLETED"; | ||
5295 | case SKD_REQ_STATE_TIMEOUT: | ||
5296 | return "TIMEOUT"; | ||
5297 | case SKD_REQ_STATE_ABORTED: | ||
5298 | return "ABORTED"; | ||
5299 | default: | ||
5300 | return "???"; | ||
5301 | } | ||
5302 | } | ||
5303 | |||
5304 | static void skd_log_skdev(struct skd_device *skdev, const char *event) | ||
5305 | { | ||
5306 | pr_debug("%s:%s:%d (%s) skdev=%p event='%s'\n", | ||
5307 | skdev->name, __func__, __LINE__, skdev->name, skdev, event); | ||
5308 | pr_debug("%s:%s:%d drive_state=%s(%d) driver_state=%s(%d)\n", | ||
5309 | skdev->name, __func__, __LINE__, | ||
5310 | skd_drive_state_to_str(skdev->drive_state), skdev->drive_state, | ||
5311 | skd_skdev_state_to_str(skdev->state), skdev->state); | ||
5312 | pr_debug("%s:%s:%d busy=%d limit=%d dev=%d lowat=%d\n", | ||
5313 | skdev->name, __func__, __LINE__, | ||
5314 | skdev->in_flight, skdev->cur_max_queue_depth, | ||
5315 | skdev->dev_max_queue_depth, skdev->queue_low_water_mark); | ||
5316 | pr_debug("%s:%s:%d timestamp=0x%x cycle=%d cycle_ix=%d\n", | ||
5317 | skdev->name, __func__, __LINE__, | ||
5318 | skdev->timeout_stamp, skdev->skcomp_cycle, skdev->skcomp_ix); | ||
5319 | } | ||
5320 | |||
5321 | static void skd_log_skmsg(struct skd_device *skdev, | ||
5322 | struct skd_fitmsg_context *skmsg, const char *event) | ||
5323 | { | ||
5324 | pr_debug("%s:%s:%d (%s) skmsg=%p event='%s'\n", | ||
5325 | skdev->name, __func__, __LINE__, skdev->name, skmsg, event); | ||
5326 | pr_debug("%s:%s:%d state=%s(%d) id=0x%04x length=%d\n", | ||
5327 | skdev->name, __func__, __LINE__, | ||
5328 | skd_skmsg_state_to_str(skmsg->state), skmsg->state, | ||
5329 | skmsg->id, skmsg->length); | ||
5330 | } | ||
5331 | |||
5332 | static void skd_log_skreq(struct skd_device *skdev, | ||
5333 | struct skd_request_context *skreq, const char *event) | ||
5334 | { | ||
5335 | pr_debug("%s:%s:%d (%s) skreq=%p event='%s'\n", | ||
5336 | skdev->name, __func__, __LINE__, skdev->name, skreq, event); | ||
5337 | pr_debug("%s:%s:%d state=%s(%d) id=0x%04x fitmsg=0x%04x\n", | ||
5338 | skdev->name, __func__, __LINE__, | ||
5339 | skd_skreq_state_to_str(skreq->state), skreq->state, | ||
5340 | skreq->id, skreq->fitmsg_id); | ||
5341 | pr_debug("%s:%s:%d timo=0x%x sg_dir=%d n_sg=%d\n", | ||
5342 | skdev->name, __func__, __LINE__, | ||
5343 | skreq->timeout_stamp, skreq->sg_data_dir, skreq->n_sg); | ||
5344 | |||
5345 | if (skreq->req != NULL) { | ||
5346 | struct request *req = skreq->req; | ||
5347 | u32 lba = (u32)blk_rq_pos(req); | ||
5348 | u32 count = blk_rq_sectors(req); | ||
5349 | |||
5350 | pr_debug("%s:%s:%d " | ||
5351 | "req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n", | ||
5352 | skdev->name, __func__, __LINE__, | ||
5353 | req, lba, lba, count, count, | ||
5354 | (int)rq_data_dir(req)); | ||
5355 | } else | ||
5356 | pr_debug("%s:%s:%d req=NULL\n", | ||
5357 | skdev->name, __func__, __LINE__); | ||
5358 | } | ||
5359 | |||
5360 | /* | ||
5361 | ***************************************************************************** | ||
5362 | * MODULE GLUE | ||
5363 | ***************************************************************************** | ||
5364 | */ | ||
5365 | |||
5366 | static int __init skd_init(void) | ||
5367 | { | ||
5368 | pr_info(PFX " v%s-b%s loaded\n", DRV_VERSION, DRV_BUILD_ID); | ||
5369 | |||
5370 | switch (skd_isr_type) { | ||
5371 | case SKD_IRQ_LEGACY: | ||
5372 | case SKD_IRQ_MSI: | ||
5373 | case SKD_IRQ_MSIX: | ||
5374 | break; | ||
5375 | default: | ||
5376 | pr_err(PFX "skd_isr_type %d invalid, re-set to %d\n", | ||
5377 | skd_isr_type, SKD_IRQ_DEFAULT); | ||
5378 | skd_isr_type = SKD_IRQ_DEFAULT; | ||
5379 | } | ||
5380 | |||
5381 | if (skd_max_queue_depth < 1 || | ||
5382 | skd_max_queue_depth > SKD_MAX_QUEUE_DEPTH) { | ||
5383 | pr_err(PFX "skd_max_queue_depth %d invalid, re-set to %d\n", | ||
5384 | skd_max_queue_depth, SKD_MAX_QUEUE_DEPTH_DEFAULT); | ||
5385 | skd_max_queue_depth = SKD_MAX_QUEUE_DEPTH_DEFAULT; | ||
5386 | } | ||
5387 | |||
5388 | if (skd_max_req_per_msg < 1 || skd_max_req_per_msg > 14) { | ||
5389 | pr_err(PFX "skd_max_req_per_msg %d invalid, re-set to %d\n", | ||
5390 | skd_max_req_per_msg, SKD_MAX_REQ_PER_MSG_DEFAULT); | ||
5391 | skd_max_req_per_msg = SKD_MAX_REQ_PER_MSG_DEFAULT; | ||
5392 | } | ||
5393 | |||
5394 | if (skd_sgs_per_request < 1 || skd_sgs_per_request > 4096) { | ||
5395 | pr_err(PFX "skd_sg_per_request %d invalid, re-set to %d\n", | ||
5396 | skd_sgs_per_request, SKD_N_SG_PER_REQ_DEFAULT); | ||
5397 | skd_sgs_per_request = SKD_N_SG_PER_REQ_DEFAULT; | ||
5398 | } | ||
5399 | |||
5400 | if (skd_dbg_level < 0 || skd_dbg_level > 2) { | ||
5401 | pr_err(PFX "skd_dbg_level %d invalid, re-set to %d\n", | ||
5402 | skd_dbg_level, 0); | ||
5403 | skd_dbg_level = 0; | ||
5404 | } | ||
5405 | |||
5406 | if (skd_isr_comp_limit < 0) { | ||
5407 | pr_err(PFX "skd_isr_comp_limit %d invalid, set to %d\n", | ||
5408 | skd_isr_comp_limit, 0); | ||
5409 | skd_isr_comp_limit = 0; | ||
5410 | } | ||
5411 | |||
5412 | if (skd_max_pass_thru < 1 || skd_max_pass_thru > 50) { | ||
5413 | pr_err(PFX "skd_max_pass_thru %d invalid, re-set to %d\n", | ||
5414 | skd_max_pass_thru, SKD_N_SPECIAL_CONTEXT); | ||
5415 | skd_max_pass_thru = SKD_N_SPECIAL_CONTEXT; | ||
5416 | } | ||
5417 | |||
5418 | return pci_register_driver(&skd_driver); | ||
5419 | } | ||
5420 | |||
5421 | static void __exit skd_exit(void) | ||
5422 | { | ||
5423 | pr_info(PFX " v%s-b%s unloading\n", DRV_VERSION, DRV_BUILD_ID); | ||
5424 | |||
5425 | pci_unregister_driver(&skd_driver); | ||
5426 | |||
5427 | if (skd_major) | ||
5428 | unregister_blkdev(skd_major, DRV_NAME); | ||
5429 | } | ||
5430 | |||
5431 | module_init(skd_init); | ||
5432 | module_exit(skd_exit); | ||
diff --git a/drivers/block/skd_s1120.h b/drivers/block/skd_s1120.h new file mode 100644 index 000000000000..61c757ff0161 --- /dev/null +++ b/drivers/block/skd_s1120.h | |||
@@ -0,0 +1,330 @@ | |||
1 | /* Copyright 2012 STEC, Inc. | ||
2 | * | ||
3 | * This file is licensed under the terms of the 3-clause | ||
4 | * BSD License (http://opensource.org/licenses/BSD-3-Clause) | ||
5 | * or the GNU GPL-2.0 (http://www.gnu.org/licenses/gpl-2.0.html), | ||
6 | * at your option. Both licenses are also available in the LICENSE file | ||
7 | * distributed with this project. This file may not be copied, modified, | ||
8 | * or distributed except in accordance with those terms. | ||
9 | */ | ||
10 | |||
11 | |||
12 | #ifndef SKD_S1120_H | ||
13 | #define SKD_S1120_H | ||
14 | |||
15 | #pragma pack(push, s1120_h, 1) | ||
16 | |||
17 | /* | ||
18 | * Q-channel, 64-bit r/w | ||
19 | */ | ||
20 | #define FIT_Q_COMMAND 0x400u | ||
21 | #define FIT_QCMD_QID_MASK (0x3 << 1) | ||
22 | #define FIT_QCMD_QID0 (0x0 << 1) | ||
23 | #define FIT_QCMD_QID_NORMAL FIT_QCMD_QID0 | ||
24 | #define FIT_QCMD_QID1 (0x1 << 1) | ||
25 | #define FIT_QCMD_QID2 (0x2 << 1) | ||
26 | #define FIT_QCMD_QID3 (0x3 << 1) | ||
27 | #define FIT_QCMD_FLUSH_QUEUE (0ull) /* add QID */ | ||
28 | #define FIT_QCMD_MSGSIZE_MASK (0x3 << 4) | ||
29 | #define FIT_QCMD_MSGSIZE_64 (0x0 << 4) | ||
30 | #define FIT_QCMD_MSGSIZE_128 (0x1 << 4) | ||
31 | #define FIT_QCMD_MSGSIZE_256 (0x2 << 4) | ||
32 | #define FIT_QCMD_MSGSIZE_512 (0x3 << 4) | ||
33 | #define FIT_QCMD_BASE_ADDRESS_MASK (0xFFFFFFFFFFFFFFC0ull) | ||
34 | |||
35 | /* | ||
36 | * Control, 32-bit r/w | ||
37 | */ | ||
38 | #define FIT_CONTROL 0x500u | ||
39 | #define FIT_CR_HARD_RESET (1u << 0u) | ||
40 | #define FIT_CR_SOFT_RESET (1u << 1u) | ||
41 | #define FIT_CR_DIS_TIMESTAMPS (1u << 6u) | ||
42 | #define FIT_CR_ENABLE_INTERRUPTS (1u << 7u) | ||
43 | |||
44 | /* | ||
45 | * Status, 32-bit, r/o | ||
46 | */ | ||
47 | #define FIT_STATUS 0x510u | ||
48 | #define FIT_SR_DRIVE_STATE_MASK 0x000000FFu | ||
49 | #define FIT_SR_SIGNATURE (0xFF << 8) | ||
50 | #define FIT_SR_PIO_DMA (1 << 16) | ||
51 | #define FIT_SR_DRIVE_OFFLINE 0x00 | ||
52 | #define FIT_SR_DRIVE_INIT 0x01 | ||
53 | /* #define FIT_SR_DRIVE_READY 0x02 */ | ||
54 | #define FIT_SR_DRIVE_ONLINE 0x03 | ||
55 | #define FIT_SR_DRIVE_BUSY 0x04 | ||
56 | #define FIT_SR_DRIVE_FAULT 0x05 | ||
57 | #define FIT_SR_DRIVE_DEGRADED 0x06 | ||
58 | #define FIT_SR_PCIE_LINK_DOWN 0x07 | ||
59 | #define FIT_SR_DRIVE_SOFT_RESET 0x08 | ||
60 | #define FIT_SR_DRIVE_INIT_FAULT 0x09 | ||
61 | #define FIT_SR_DRIVE_BUSY_SANITIZE 0x0A | ||
62 | #define FIT_SR_DRIVE_BUSY_ERASE 0x0B | ||
63 | #define FIT_SR_DRIVE_FW_BOOTING 0x0C | ||
64 | #define FIT_SR_DRIVE_NEED_FW_DOWNLOAD 0xFE | ||
65 | #define FIT_SR_DEVICE_MISSING 0xFF | ||
66 | #define FIT_SR__RESERVED 0xFFFFFF00u | ||
67 | |||
68 | /* | ||
69 | * FIT_STATUS - Status register data definition | ||
70 | */ | ||
71 | #define FIT_SR_STATE_MASK (0xFF << 0) | ||
72 | #define FIT_SR_SIGNATURE (0xFF << 8) | ||
73 | #define FIT_SR_PIO_DMA (1 << 16) | ||
74 | |||
75 | /* | ||
76 | * Interrupt status, 32-bit r/w1c (w1c ==> write 1 to clear) | ||
77 | */ | ||
78 | #define FIT_INT_STATUS_HOST 0x520u | ||
79 | #define FIT_ISH_FW_STATE_CHANGE (1u << 0u) | ||
80 | #define FIT_ISH_COMPLETION_POSTED (1u << 1u) | ||
81 | #define FIT_ISH_MSG_FROM_DEV (1u << 2u) | ||
82 | #define FIT_ISH_UNDEFINED_3 (1u << 3u) | ||
83 | #define FIT_ISH_UNDEFINED_4 (1u << 4u) | ||
84 | #define FIT_ISH_Q0_FULL (1u << 5u) | ||
85 | #define FIT_ISH_Q1_FULL (1u << 6u) | ||
86 | #define FIT_ISH_Q2_FULL (1u << 7u) | ||
87 | #define FIT_ISH_Q3_FULL (1u << 8u) | ||
88 | #define FIT_ISH_QCMD_FIFO_OVERRUN (1u << 9u) | ||
89 | #define FIT_ISH_BAD_EXP_ROM_READ (1u << 10u) | ||
90 | |||
91 | #define FIT_INT_DEF_MASK \ | ||
92 | (FIT_ISH_FW_STATE_CHANGE | \ | ||
93 | FIT_ISH_COMPLETION_POSTED | \ | ||
94 | FIT_ISH_MSG_FROM_DEV | \ | ||
95 | FIT_ISH_Q0_FULL | \ | ||
96 | FIT_ISH_Q1_FULL | \ | ||
97 | FIT_ISH_Q2_FULL | \ | ||
98 | FIT_ISH_Q3_FULL | \ | ||
99 | FIT_ISH_QCMD_FIFO_OVERRUN | \ | ||
100 | FIT_ISH_BAD_EXP_ROM_READ) | ||
101 | |||
102 | #define FIT_INT_QUEUE_FULL \ | ||
103 | (FIT_ISH_Q0_FULL | \ | ||
104 | FIT_ISH_Q1_FULL | \ | ||
105 | FIT_ISH_Q2_FULL | \ | ||
106 | FIT_ISH_Q3_FULL) | ||
107 | |||
108 | #define MSI_MSG_NWL_ERROR_0 0x00000000 | ||
109 | #define MSI_MSG_NWL_ERROR_1 0x00000001 | ||
110 | #define MSI_MSG_NWL_ERROR_2 0x00000002 | ||
111 | #define MSI_MSG_NWL_ERROR_3 0x00000003 | ||
112 | #define MSI_MSG_STATE_CHANGE 0x00000004 | ||
113 | #define MSI_MSG_COMPLETION_POSTED 0x00000005 | ||
114 | #define MSI_MSG_MSG_FROM_DEV 0x00000006 | ||
115 | #define MSI_MSG_RESERVED_0 0x00000007 | ||
116 | #define MSI_MSG_RESERVED_1 0x00000008 | ||
117 | #define MSI_MSG_QUEUE_0_FULL 0x00000009 | ||
118 | #define MSI_MSG_QUEUE_1_FULL 0x0000000A | ||
119 | #define MSI_MSG_QUEUE_2_FULL 0x0000000B | ||
120 | #define MSI_MSG_QUEUE_3_FULL 0x0000000C | ||
121 | |||
122 | #define FIT_INT_RESERVED_MASK \ | ||
123 | (FIT_ISH_UNDEFINED_3 | \ | ||
124 | FIT_ISH_UNDEFINED_4) | ||
125 | |||
126 | /* | ||
127 | * Interrupt mask, 32-bit r/w | ||
128 | * Bit definitions are the same as FIT_INT_STATUS_HOST | ||
129 | */ | ||
130 | #define FIT_INT_MASK_HOST 0x528u | ||
131 | |||
132 | /* | ||
133 | * Message to device, 32-bit r/w | ||
134 | */ | ||
135 | #define FIT_MSG_TO_DEVICE 0x540u | ||
136 | |||
137 | /* | ||
138 | * Message from device, 32-bit, r/o | ||
139 | */ | ||
140 | #define FIT_MSG_FROM_DEVICE 0x548u | ||
141 | |||
142 | /* | ||
143 | * 32-bit messages to/from device, composition/extraction macros | ||
144 | */ | ||
145 | #define FIT_MXD_CONS(TYPE, PARAM, DATA) \ | ||
146 | ((((TYPE) & 0xFFu) << 24u) | \ | ||
147 | (((PARAM) & 0xFFu) << 16u) | \ | ||
148 | (((DATA) & 0xFFFFu) << 0u)) | ||
149 | #define FIT_MXD_TYPE(MXD) (((MXD) >> 24u) & 0xFFu) | ||
150 | #define FIT_MXD_PARAM(MXD) (((MXD) >> 16u) & 0xFFu) | ||
151 | #define FIT_MXD_DATA(MXD) (((MXD) >> 0u) & 0xFFFFu) | ||
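A quick worked example of the composition/extraction macros above; this is not part of the header, and it assumes skd_s1120.h can be included from a user-space test build alongside <stdint.h>.

	#include <stdint.h>
	#include <assert.h>
	#include "skd_s1120.h"	/* the header added in this patch (assumed reachable) */

	int main(void)
	{
		/* FIT_MTD_ARM_QUEUE is 0x05, so this packs 0x05000001. */
		uint32_t mxd = FIT_MXD_CONS(FIT_MTD_ARM_QUEUE, 0, 1);

		assert(FIT_MXD_TYPE(mxd) == FIT_MTD_ARM_QUEUE);	/* 0x05 */
		assert(FIT_MXD_PARAM(mxd) == 0);			/* 0x00 */
		assert(FIT_MXD_DATA(mxd) == 1);				/* 0x0001 */
		return 0;
	}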
152 | |||
153 | /* | ||
154 | * Types of messages to/from device | ||
155 | */ | ||
156 | #define FIT_MTD_FITFW_INIT 0x01u | ||
157 | #define FIT_MTD_GET_CMDQ_DEPTH 0x02u | ||
158 | #define FIT_MTD_SET_COMPQ_DEPTH 0x03u | ||
159 | #define FIT_MTD_SET_COMPQ_ADDR 0x04u | ||
160 | #define FIT_MTD_ARM_QUEUE 0x05u | ||
161 | #define FIT_MTD_CMD_LOG_HOST_ID 0x07u | ||
162 | #define FIT_MTD_CMD_LOG_TIME_STAMP_LO 0x08u | ||
163 | #define FIT_MTD_CMD_LOG_TIME_STAMP_HI 0x09u | ||
164 | #define FIT_MFD_SMART_EXCEEDED 0x10u | ||
165 | #define FIT_MFD_POWER_DOWN 0x11u | ||
166 | #define FIT_MFD_OFFLINE 0x12u | ||
167 | #define FIT_MFD_ONLINE 0x13u | ||
168 | #define FIT_MFD_FW_RESTARTING 0x14u | ||
169 | #define FIT_MFD_PM_ACTIVE 0x15u | ||
170 | #define FIT_MFD_PM_STANDBY 0x16u | ||
171 | #define FIT_MFD_PM_SLEEP 0x17u | ||
172 | #define FIT_MFD_CMD_PROGRESS 0x18u | ||
173 | |||
174 | #define FIT_MTD_DEBUG 0xFEu | ||
175 | #define FIT_MFD_DEBUG 0xFFu | ||
176 | |||
177 | #define FIT_MFD_MASK (0xFFu) | ||
178 | #define FIT_MFD_DATA_MASK (0xFFu) | ||
179 | #define FIT_MFD_MSG(x) (((x) >> 24) & FIT_MFD_MASK) | ||
180 | #define FIT_MFD_DATA(x) ((x) & FIT_MFD_MASK) | ||
181 | |||
182 | /* | ||
183 | * Extra arg to FIT_MSG_TO_DEVICE, 64-bit r/w | ||
184 | * Used to set completion queue address (FIT_MTD_SET_COMPQ_ADDR) | ||
185 | * (was Response buffer in docs) | ||
186 | */ | ||
187 | #define FIT_MSG_TO_DEVICE_ARG 0x580u | ||
188 | |||
189 | /* | ||
190 | * Hardware (ASIC) version, 32-bit r/o | ||
191 | */ | ||
192 | #define FIT_HW_VERSION 0x588u | ||
193 | |||
194 | /* | ||
195 | * Scatter/gather list descriptor. | ||
196 | * 32-bytes and must be aligned on a 32-byte boundary. | ||
197 | * All fields are in little endian order. | ||
198 | */ | ||
199 | struct fit_sg_descriptor { | ||
200 | uint32_t control; | ||
201 | uint32_t byte_count; | ||
202 | uint64_t host_side_addr; | ||
203 | uint64_t dev_side_addr; | ||
204 | uint64_t next_desc_ptr; | ||
205 | }; | ||
206 | |||
207 | #define FIT_SGD_CONTROL_NOT_LAST 0x000u | ||
208 | #define FIT_SGD_CONTROL_LAST 0x40Eu | ||
209 | |||
210 | /* | ||
211 | * Header at the beginning of a FIT message. The header | ||
212 | * is followed by SSDI requests each 64 bytes. | ||
213 | * A FIT message can be up to 512 bytes long and must start | ||
214 | * on a 64-byte boundary. | ||
215 | */ | ||
216 | struct fit_msg_hdr { | ||
217 | uint8_t protocol_id; | ||
218 | uint8_t num_protocol_cmds_coalesced; | ||
219 | uint8_t _reserved[62]; | ||
220 | }; | ||
221 | |||
222 | #define FIT_PROTOCOL_ID_FIT 1 | ||
223 | #define FIT_PROTOCOL_ID_SSDI 2 | ||
224 | #define FIT_PROTOCOL_ID_SOFIT 3 | ||
225 | |||
226 | |||
227 | #define FIT_PROTOCOL_MINOR_VER(mtd_val) ((mtd_val >> 16) & 0xF) | ||
228 | #define FIT_PROTOCOL_MAJOR_VER(mtd_val) ((mtd_val >> 20) & 0xF) | ||
229 | |||
230 | /* | ||
231 | * Format of a completion entry. The completion queue is circular | ||
232 | * and must have at least as many entries as the maximum number | ||
233 | * of commands that may be issued to the device. | ||
234 | * | ||
235 | * There are no head/tail pointers. The cycle value is used to | ||
236 | * infer the presence of new completion records. | ||
237 | * Initially the cycle in all entries is 0, the index is 0, and | ||
238 | * the cycle value to expect is 1. When completions are added | ||
239 | * their cycle values are set to 1. When the index wraps the | ||
240 | * cycle value to expect is incremented. | ||
241 | * | ||
242 | * Command_context is opaque and taken verbatim from the SSDI command. | ||
243 | * All other fields are big endian. | ||
244 | */ | ||
245 | #define FIT_PROTOCOL_VERSION_0 0 | ||
246 | |||
247 | /* | ||
248 | * Protocol major version 1 completion entry. | ||
249 | * The major protocol version is found in bits | ||
250 | * 20-23 of the FIT_MTD_FITFW_INIT response. | ||
251 | */ | ||
252 | struct fit_completion_entry_v1 { | ||
253 | uint32_t num_returned_bytes; | ||
254 | uint16_t tag; | ||
255 | uint8_t status; /* SCSI status */ | ||
256 | uint8_t cycle; | ||
257 | }; | ||
258 | #define FIT_PROTOCOL_VERSION_1 1 | ||
259 | #define FIT_PROTOCOL_VERSION_CURRENT FIT_PROTOCOL_VERSION_1 | ||
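The cycle-based consumption described in the comment above boils down to a short loop: consume entries whose cycle field matches the expected value, and bump the expected value each time the index wraps. A minimal sketch follows; drain_completions() is an illustrative name, not the driver's interrupt handler.

	#include <stdint.h>

	/* Consume new completion entries. ix and cycle persist across calls and start
	 * at 0 and 1 respectively, matching the initial state described above. */
	static void drain_completions(struct fit_completion_entry_v1 *cq,
				      uint32_t n_entries, uint32_t *ix, uint8_t *cycle)
	{
		while (cq[*ix].cycle == *cycle) {
			/* handle cq[*ix]: tag, status, num_returned_bytes */
			if (++(*ix) == n_entries) {
				*ix = 0;
				(*cycle)++;	/* next lap expects the next cycle value */
			}
		}
	}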
260 | |||
261 | struct fit_comp_error_info { | ||
262 | uint8_t type:7; /* 00: Bits0-6 indicates the type of sense data. */ | ||
263 | uint8_t valid:1; /* 00: Bit 7 := 1 ==> info field is valid. */ | ||
264 | uint8_t reserved0; /* 01: Obsolete field */ | ||
265 | uint8_t key:4; /* 02: Bits0-3 indicate the sense key. */ | ||
266 | uint8_t reserved2:1; /* 02: Reserved bit. */ | ||
267 | uint8_t bad_length:1; /* 02: Incorrect Length Indicator */ | ||
268 | uint8_t end_medium:1; /* 02: End of Medium */ | ||
269 | uint8_t file_mark:1; /* 02: Filemark */ | ||
270 | uint8_t info[4]; /* 03: */ | ||
271 | uint8_t reserved1; /* 07: Additional Sense Length */ | ||
272 | uint8_t cmd_spec[4]; /* 08: Command Specific Information */ | ||
273 | uint8_t code; /* 0C: Additional Sense Code */ | ||
274 | uint8_t qual; /* 0D: Additional Sense Code Qualifier */ | ||
275 | uint8_t fruc; /* 0E: Field Replaceable Unit Code */ | ||
276 | uint8_t sks_high:7; /* 0F: Sense Key Specific (MSB) */ | ||
277 | uint8_t sks_valid:1; /* 0F: Sense Key Specific Valid */ | ||
278 | uint16_t sks_low; /* 10: Sense Key Specific (LSW) */ | ||
279 | uint16_t reserved3; /* 12: Part of additional sense bytes (unused) */ | ||
280 | uint16_t uec; /* 14: Additional Sense Bytes */ | ||
281 | uint64_t per; /* 16: Additional Sense Bytes */ | ||
282 | uint8_t reserved4[2]; /* 1E: Additional Sense Bytes (unused) */ | ||
283 | }; | ||
284 | |||
285 | |||
286 | /* Task management constants */ | ||
287 | #define SOFT_TASK_SIMPLE 0x00 | ||
288 | #define SOFT_TASK_HEAD_OF_QUEUE 0x01 | ||
289 | #define SOFT_TASK_ORDERED 0x02 | ||
290 | |||
291 | /* Version zero has the last 32 bits reserved; | ||
292 | * version one uses them for sg_list_len_bytes. | ||
293 | */ | ||
294 | struct skd_command_header { | ||
295 | uint64_t sg_list_dma_address; | ||
296 | uint16_t tag; | ||
297 | uint8_t attribute; | ||
298 | uint8_t add_cdb_len; /* In 32 bit words */ | ||
299 | uint32_t sg_list_len_bytes; | ||
300 | }; | ||
301 | |||
302 | struct skd_scsi_request { | ||
303 | struct skd_command_header hdr; | ||
304 | unsigned char cdb[16]; | ||
305 | /* unsigned char _reserved[16]; */ | ||
306 | }; | ||
307 | |||
308 | struct driver_inquiry_data { | ||
309 | uint8_t peripheral_device_type:5; | ||
310 | uint8_t qualifier:3; | ||
311 | uint8_t page_code; | ||
312 | uint16_t page_length; | ||
313 | uint16_t pcie_bus_number; | ||
314 | uint8_t pcie_device_number; | ||
315 | uint8_t pcie_function_number; | ||
316 | uint8_t pcie_link_speed; | ||
317 | uint8_t pcie_link_lanes; | ||
318 | uint16_t pcie_vendor_id; | ||
319 | uint16_t pcie_device_id; | ||
320 | uint16_t pcie_subsystem_vendor_id; | ||
321 | uint16_t pcie_subsystem_device_id; | ||
322 | uint8_t reserved1[2]; | ||
323 | uint8_t reserved2[3]; | ||
324 | uint8_t driver_version_length; | ||
325 | uint8_t driver_version[0x14]; | ||
326 | }; | ||
327 | |||
328 | #pragma pack(pop, s1120_h) | ||
329 | |||
330 | #endif /* SKD_S1120_H */ | ||
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index bf4b9d282c04..6620b73d0490 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c | |||
@@ -887,6 +887,8 @@ static int dispatch_discard_io(struct xen_blkif *blkif, | |||
887 | unsigned long secure; | 887 | unsigned long secure; |
888 | struct phys_req preq; | 888 | struct phys_req preq; |
889 | 889 | ||
890 | xen_blkif_get(blkif); | ||
891 | |||
890 | preq.sector_number = req->u.discard.sector_number; | 892 | preq.sector_number = req->u.discard.sector_number; |
891 | preq.nr_sects = req->u.discard.nr_sectors; | 893 | preq.nr_sects = req->u.discard.nr_sectors; |
892 | 894 | ||
@@ -899,7 +901,6 @@ static int dispatch_discard_io(struct xen_blkif *blkif, | |||
899 | } | 901 | } |
900 | blkif->st_ds_req++; | 902 | blkif->st_ds_req++; |
901 | 903 | ||
902 | xen_blkif_get(blkif); | ||
903 | secure = (blkif->vbd.discard_secure && | 904 | secure = (blkif->vbd.discard_secure && |
904 | (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? | 905 | (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? |
905 | BLKDEV_DISCARD_SECURE : 0; | 906 | BLKDEV_DISCARD_SECURE : 0; |
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 8d53ed293606..432db1b59b00 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c | |||
@@ -121,7 +121,8 @@ struct blkfront_info | |||
121 | struct work_struct work; | 121 | struct work_struct work; |
122 | struct gnttab_free_callback callback; | 122 | struct gnttab_free_callback callback; |
123 | struct blk_shadow shadow[BLK_RING_SIZE]; | 123 | struct blk_shadow shadow[BLK_RING_SIZE]; |
124 | struct list_head persistent_gnts; | 124 | struct list_head grants; |
125 | struct list_head indirect_pages; | ||
125 | unsigned int persistent_gnts_c; | 126 | unsigned int persistent_gnts_c; |
126 | unsigned long shadow_free; | 127 | unsigned long shadow_free; |
127 | unsigned int feature_flush; | 128 | unsigned int feature_flush; |
@@ -200,15 +201,17 @@ static int fill_grant_buffer(struct blkfront_info *info, int num) | |||
200 | if (!gnt_list_entry) | 201 | if (!gnt_list_entry) |
201 | goto out_of_memory; | 202 | goto out_of_memory; |
202 | 203 | ||
203 | granted_page = alloc_page(GFP_NOIO); | 204 | if (info->feature_persistent) { |
204 | if (!granted_page) { | 205 | granted_page = alloc_page(GFP_NOIO); |
205 | kfree(gnt_list_entry); | 206 | if (!granted_page) { |
206 | goto out_of_memory; | 207 | kfree(gnt_list_entry); |
208 | goto out_of_memory; | ||
209 | } | ||
210 | gnt_list_entry->pfn = page_to_pfn(granted_page); | ||
207 | } | 211 | } |
208 | 212 | ||
209 | gnt_list_entry->pfn = page_to_pfn(granted_page); | ||
210 | gnt_list_entry->gref = GRANT_INVALID_REF; | 213 | gnt_list_entry->gref = GRANT_INVALID_REF; |
211 | list_add(&gnt_list_entry->node, &info->persistent_gnts); | 214 | list_add(&gnt_list_entry->node, &info->grants); |
212 | i++; | 215 | i++; |
213 | } | 216 | } |
214 | 217 | ||
@@ -216,9 +219,10 @@ static int fill_grant_buffer(struct blkfront_info *info, int num) | |||
216 | 219 | ||
217 | out_of_memory: | 220 | out_of_memory: |
218 | list_for_each_entry_safe(gnt_list_entry, n, | 221 | list_for_each_entry_safe(gnt_list_entry, n, |
219 | &info->persistent_gnts, node) { | 222 | &info->grants, node) { |
220 | list_del(&gnt_list_entry->node); | 223 | list_del(&gnt_list_entry->node); |
221 | __free_page(pfn_to_page(gnt_list_entry->pfn)); | 224 | if (info->feature_persistent) |
225 | __free_page(pfn_to_page(gnt_list_entry->pfn)); | ||
222 | kfree(gnt_list_entry); | 226 | kfree(gnt_list_entry); |
223 | i--; | 227 | i--; |
224 | } | 228 | } |
@@ -227,13 +231,14 @@ out_of_memory: | |||
227 | } | 231 | } |
228 | 232 | ||
229 | static struct grant *get_grant(grant_ref_t *gref_head, | 233 | static struct grant *get_grant(grant_ref_t *gref_head, |
234 | unsigned long pfn, | ||
230 | struct blkfront_info *info) | 235 | struct blkfront_info *info) |
231 | { | 236 | { |
232 | struct grant *gnt_list_entry; | 237 | struct grant *gnt_list_entry; |
233 | unsigned long buffer_mfn; | 238 | unsigned long buffer_mfn; |
234 | 239 | ||
235 | BUG_ON(list_empty(&info->persistent_gnts)); | 240 | BUG_ON(list_empty(&info->grants)); |
236 | gnt_list_entry = list_first_entry(&info->persistent_gnts, struct grant, | 241 | gnt_list_entry = list_first_entry(&info->grants, struct grant, |
237 | node); | 242 | node); |
238 | list_del(&gnt_list_entry->node); | 243 | list_del(&gnt_list_entry->node); |
239 | 244 | ||
@@ -245,6 +250,10 @@ static struct grant *get_grant(grant_ref_t *gref_head, | |||
245 | /* Assign a gref to this page */ | 250 | /* Assign a gref to this page */ |
246 | gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); | 251 | gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); |
247 | BUG_ON(gnt_list_entry->gref == -ENOSPC); | 252 | BUG_ON(gnt_list_entry->gref == -ENOSPC); |
253 | if (!info->feature_persistent) { | ||
254 | BUG_ON(!pfn); | ||
255 | gnt_list_entry->pfn = pfn; | ||
256 | } | ||
248 | buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn); | 257 | buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn); |
249 | gnttab_grant_foreign_access_ref(gnt_list_entry->gref, | 258 | gnttab_grant_foreign_access_ref(gnt_list_entry->gref, |
250 | info->xbdev->otherend_id, | 259 | info->xbdev->otherend_id, |
@@ -400,10 +409,13 @@ static int blkif_queue_request(struct request *req) | |||
400 | if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) | 409 | if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) |
401 | return 1; | 410 | return 1; |
402 | 411 | ||
403 | max_grefs = info->max_indirect_segments ? | 412 | max_grefs = req->nr_phys_segments; |
404 | info->max_indirect_segments + | 413 | if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST) |
405 | INDIRECT_GREFS(info->max_indirect_segments) : | 414 | /* |
406 | BLKIF_MAX_SEGMENTS_PER_REQUEST; | 415 | * If we are using indirect segments we need to account |
416 | * for the indirect grefs used in the request. | ||
417 | */ | ||
418 | max_grefs += INDIRECT_GREFS(req->nr_phys_segments); | ||
407 | 419 | ||
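For readers following the grant arithmetic in this hunk, here is a minimal standalone sketch of the accounting. The macro names are the ones used above, but the two numeric constants (512 segment entries per 4 KiB indirect frame, 11 inline segments per request) are assumptions for illustration, not values taken from this patch.

/* Hypothetical standalone model of the gref accounting above; the two
 * constant values are assumptions, not taken from this hunk. */
#include <stdio.h>

#define SEGS_PER_INDIRECT_FRAME        512
#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
#define INDIRECT_GREFS(segs) \
	(((segs) + SEGS_PER_INDIRECT_FRAME - 1) / SEGS_PER_INDIRECT_FRAME)

static unsigned grefs_needed(unsigned nr_phys_segments)
{
	unsigned max_grefs = nr_phys_segments;

	/* Requests that spill past the inline limit use indirect descriptors,
	 * so the frames holding the indirect grefs must be granted too. */
	if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST)
		max_grefs += INDIRECT_GREFS(nr_phys_segments);

	return max_grefs;
}

int main(void)
{
	printf("8 segments  -> %u grefs\n", grefs_needed(8));  /* 8  (inline) */
	printf("64 segments -> %u grefs\n", grefs_needed(64)); /* 65 (+1 indirect frame) */
	return 0;
}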
408 | /* Check if we have enough grants to allocate a request */ | 420 | /* Check if we have enough grants to allocate a request */ |
409 | if (info->persistent_gnts_c < max_grefs) { | 421 | if (info->persistent_gnts_c < max_grefs) { |
@@ -477,22 +489,34 @@ static int blkif_queue_request(struct request *req) | |||
477 | 489 | ||
478 | if ((ring_req->operation == BLKIF_OP_INDIRECT) && | 490 | if ((ring_req->operation == BLKIF_OP_INDIRECT) && |
479 | (i % SEGS_PER_INDIRECT_FRAME == 0)) { | 491 | (i % SEGS_PER_INDIRECT_FRAME == 0)) { |
492 | unsigned long pfn; | ||
493 | |||
480 | if (segments) | 494 | if (segments) |
481 | kunmap_atomic(segments); | 495 | kunmap_atomic(segments); |
482 | 496 | ||
483 | n = i / SEGS_PER_INDIRECT_FRAME; | 497 | n = i / SEGS_PER_INDIRECT_FRAME; |
484 | gnt_list_entry = get_grant(&gref_head, info); | 498 | if (!info->feature_persistent) { |
499 | struct page *indirect_page; | ||
500 | |||
501 | /* Fetch a pre-allocated page to use for indirect grefs */ | ||
502 | BUG_ON(list_empty(&info->indirect_pages)); | ||
503 | indirect_page = list_first_entry(&info->indirect_pages, | ||
504 | struct page, lru); | ||
505 | list_del(&indirect_page->lru); | ||
506 | pfn = page_to_pfn(indirect_page); | ||
507 | } | ||
508 | gnt_list_entry = get_grant(&gref_head, pfn, info); | ||
485 | info->shadow[id].indirect_grants[n] = gnt_list_entry; | 509 | info->shadow[id].indirect_grants[n] = gnt_list_entry; |
486 | segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); | 510 | segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); |
487 | ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; | 511 | ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; |
488 | } | 512 | } |
489 | 513 | ||
490 | gnt_list_entry = get_grant(&gref_head, info); | 514 | gnt_list_entry = get_grant(&gref_head, page_to_pfn(sg_page(sg)), info); |
491 | ref = gnt_list_entry->gref; | 515 | ref = gnt_list_entry->gref; |
492 | 516 | ||
493 | info->shadow[id].grants_used[i] = gnt_list_entry; | 517 | info->shadow[id].grants_used[i] = gnt_list_entry; |
494 | 518 | ||
495 | if (rq_data_dir(req)) { | 519 | if (rq_data_dir(req) && info->feature_persistent) { |
496 | char *bvec_data; | 520 | char *bvec_data; |
497 | void *shared_data; | 521 | void *shared_data; |
498 | 522 | ||
@@ -904,21 +928,36 @@ static void blkif_free(struct blkfront_info *info, int suspend) | |||
904 | blk_stop_queue(info->rq); | 928 | blk_stop_queue(info->rq); |
905 | 929 | ||
906 | /* Remove all persistent grants */ | 930 | /* Remove all persistent grants */ |
907 | if (!list_empty(&info->persistent_gnts)) { | 931 | if (!list_empty(&info->grants)) { |
908 | list_for_each_entry_safe(persistent_gnt, n, | 932 | list_for_each_entry_safe(persistent_gnt, n, |
909 | &info->persistent_gnts, node) { | 933 | &info->grants, node) { |
910 | list_del(&persistent_gnt->node); | 934 | list_del(&persistent_gnt->node); |
911 | if (persistent_gnt->gref != GRANT_INVALID_REF) { | 935 | if (persistent_gnt->gref != GRANT_INVALID_REF) { |
912 | gnttab_end_foreign_access(persistent_gnt->gref, | 936 | gnttab_end_foreign_access(persistent_gnt->gref, |
913 | 0, 0UL); | 937 | 0, 0UL); |
914 | info->persistent_gnts_c--; | 938 | info->persistent_gnts_c--; |
915 | } | 939 | } |
916 | __free_page(pfn_to_page(persistent_gnt->pfn)); | 940 | if (info->feature_persistent) |
941 | __free_page(pfn_to_page(persistent_gnt->pfn)); | ||
917 | kfree(persistent_gnt); | 942 | kfree(persistent_gnt); |
918 | } | 943 | } |
919 | } | 944 | } |
920 | BUG_ON(info->persistent_gnts_c != 0); | 945 | BUG_ON(info->persistent_gnts_c != 0); |
921 | 946 | ||
947 | /* | ||
948 | * Remove indirect pages; this only happens when using indirect ||
949 | * descriptors but not persistent grants. ||
950 | */ | ||
951 | if (!list_empty(&info->indirect_pages)) { | ||
952 | struct page *indirect_page, *n; | ||
953 | |||
954 | BUG_ON(info->feature_persistent); | ||
955 | list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { | ||
956 | list_del(&indirect_page->lru); | ||
957 | __free_page(indirect_page); | ||
958 | } | ||
959 | } | ||
960 | |||
922 | for (i = 0; i < BLK_RING_SIZE; i++) { | 961 | for (i = 0; i < BLK_RING_SIZE; i++) { |
923 | /* | 962 | /* |
924 | * Clear persistent grants present in requests already | 963 | * Clear persistent grants present in requests already |
@@ -933,7 +972,8 @@ static void blkif_free(struct blkfront_info *info, int suspend) | |||
933 | for (j = 0; j < segs; j++) { | 972 | for (j = 0; j < segs; j++) { |
934 | persistent_gnt = info->shadow[i].grants_used[j]; | 973 | persistent_gnt = info->shadow[i].grants_used[j]; |
935 | gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); | 974 | gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); |
936 | __free_page(pfn_to_page(persistent_gnt->pfn)); | 975 | if (info->feature_persistent) |
976 | __free_page(pfn_to_page(persistent_gnt->pfn)); | ||
937 | kfree(persistent_gnt); | 977 | kfree(persistent_gnt); |
938 | } | 978 | } |
939 | 979 | ||
@@ -992,7 +1032,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, | |||
992 | nseg = s->req.operation == BLKIF_OP_INDIRECT ? | 1032 | nseg = s->req.operation == BLKIF_OP_INDIRECT ? |
993 | s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; | 1033 | s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; |
994 | 1034 | ||
995 | if (bret->operation == BLKIF_OP_READ) { | 1035 | if (bret->operation == BLKIF_OP_READ && info->feature_persistent) { |
996 | /* | 1036 | /* |
997 | * Copy the data received from the backend into the bvec. | 1037 | * Copy the data received from the backend into the bvec. |
998 | * Since bv_offset can be different than 0, and bv_len different | 1038 | * Since bv_offset can be different than 0, and bv_len different |
@@ -1013,13 +1053,51 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, | |||
1013 | } | 1053 | } |
1014 | /* Add the persistent grant into the list of free grants */ | 1054 | /* Add the persistent grant into the list of free grants */ |
1015 | for (i = 0; i < nseg; i++) { | 1055 | for (i = 0; i < nseg; i++) { |
1016 | list_add(&s->grants_used[i]->node, &info->persistent_gnts); | 1056 | if (gnttab_query_foreign_access(s->grants_used[i]->gref)) { |
1017 | info->persistent_gnts_c++; | 1057 | /* |
1058 | * If the grant is still mapped by the backend (the | ||
1059 | * backend has chosen to make this grant persistent) | ||
1060 | * we add it at the head of the list, so it will be | ||
1061 | * reused first. | ||
1062 | */ | ||
1063 | if (!info->feature_persistent) | ||
1064 | pr_alert_ratelimited("backend has not unmapped grant: %u\n", ||
1065 | s->grants_used[i]->gref); | ||
1066 | list_add(&s->grants_used[i]->node, &info->grants); | ||
1067 | info->persistent_gnts_c++; | ||
1068 | } else { | ||
1069 | /* | ||
1070 | * If the grant is not mapped by the backend we end the | ||
1071 | * foreign access and add it to the tail of the list, | ||
1072 | * so it will not be picked again unless we run out of | ||
1073 | * persistent grants. | ||
1074 | */ | ||
1075 | gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL); | ||
1076 | s->grants_used[i]->gref = GRANT_INVALID_REF; | ||
1077 | list_add_tail(&s->grants_used[i]->node, &info->grants); | ||
1078 | } | ||
1018 | } | 1079 | } |
1019 | if (s->req.operation == BLKIF_OP_INDIRECT) { | 1080 | if (s->req.operation == BLKIF_OP_INDIRECT) { |
1020 | for (i = 0; i < INDIRECT_GREFS(nseg); i++) { | 1081 | for (i = 0; i < INDIRECT_GREFS(nseg); i++) { |
1021 | list_add(&s->indirect_grants[i]->node, &info->persistent_gnts); | 1082 | if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) { |
1022 | info->persistent_gnts_c++; | 1083 | if (!info->feature_persistent) |
1084 | pr_alert_ratelimited("backend has not unmapped grant: %u\n", ||
1085 | s->indirect_grants[i]->gref); | ||
1086 | list_add(&s->indirect_grants[i]->node, &info->grants); | ||
1087 | info->persistent_gnts_c++; | ||
1088 | } else { | ||
1089 | struct page *indirect_page; | ||
1090 | |||
1091 | gnttab_end_foreign_access(s->indirect_grants[i]->gref, 0, 0UL); | ||
1092 | /* | ||
1093 | * Add the used indirect page back to the list of | ||
1094 | * available pages for indirect grefs. | ||
1095 | */ | ||
1096 | indirect_page = pfn_to_page(s->indirect_grants[i]->pfn); | ||
1097 | list_add(&indirect_page->lru, &info->indirect_pages); | ||
1098 | s->indirect_grants[i]->gref = GRANT_INVALID_REF; | ||
1099 | list_add_tail(&s->indirect_grants[i]->node, &info->grants); | ||
1100 | } | ||
1023 | } | 1101 | } |
1024 | } | 1102 | } |
1025 | } | 1103 | } |
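The recycling policy this completion-path hunk introduces can be read as one small decision per grant. The helper below is a hypothetical restatement of it, not a function in the driver; it only uses calls that already appear in the hunk.

/* Hypothetical restatement of the per-grant recycle decision above;
 * not a helper that exists in the driver. */
static void recycle_grant(struct blkfront_info *info, struct grant *gnt)
{
	if (gnttab_query_foreign_access(gnt->gref)) {
		/* Backend kept the grant mapped (it chose to treat it as
		 * persistent): keep it at the head so it is reused first. */
		list_add(&gnt->node, &info->grants);
		info->persistent_gnts_c++;
	} else {
		/* Backend unmapped it: revoke our grant and park it at the
		 * tail, to be reused only when persistent grants run out. */
		gnttab_end_foreign_access(gnt->gref, 0, 0UL);
		gnt->gref = GRANT_INVALID_REF;
		list_add_tail(&gnt->node, &info->grants);
	}
}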
@@ -1313,7 +1391,8 @@ static int blkfront_probe(struct xenbus_device *dev, | |||
1313 | spin_lock_init(&info->io_lock); | 1391 | spin_lock_init(&info->io_lock); |
1314 | info->xbdev = dev; | 1392 | info->xbdev = dev; |
1315 | info->vdevice = vdevice; | 1393 | info->vdevice = vdevice; |
1316 | INIT_LIST_HEAD(&info->persistent_gnts); | 1394 | INIT_LIST_HEAD(&info->grants); |
1395 | INIT_LIST_HEAD(&info->indirect_pages); | ||
1317 | info->persistent_gnts_c = 0; | 1396 | info->persistent_gnts_c = 0; |
1318 | info->connected = BLKIF_STATE_DISCONNECTED; | 1397 | info->connected = BLKIF_STATE_DISCONNECTED; |
1319 | INIT_WORK(&info->work, blkif_restart_queue); | 1398 | INIT_WORK(&info->work, blkif_restart_queue); |
@@ -1609,6 +1688,23 @@ static int blkfront_setup_indirect(struct blkfront_info *info) | |||
1609 | if (err) | 1688 | if (err) |
1610 | goto out_of_memory; | 1689 | goto out_of_memory; |
1611 | 1690 | ||
1691 | if (!info->feature_persistent && info->max_indirect_segments) { | ||
1692 | /* | ||
1693 | * We are using indirect descriptors but not persistent ||
1694 | * grants, so we need to allocate a set of pages that can ||
1695 | * be used for mapping indirect grefs. ||
1696 | */ | ||
1697 | int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE; | ||
1698 | |||
1699 | BUG_ON(!list_empty(&info->indirect_pages)); | ||
1700 | for (i = 0; i < num; i++) { | ||
1701 | struct page *indirect_page = alloc_page(GFP_NOIO); | ||
1702 | if (!indirect_page) | ||
1703 | goto out_of_memory; | ||
1704 | list_add(&indirect_page->lru, &info->indirect_pages); | ||
1705 | } | ||
1706 | } | ||
1707 | |||
1612 | for (i = 0; i < BLK_RING_SIZE; i++) { | 1708 | for (i = 0; i < BLK_RING_SIZE; i++) { |
1613 | info->shadow[i].grants_used = kzalloc( | 1709 | info->shadow[i].grants_used = kzalloc( |
1614 | sizeof(info->shadow[i].grants_used[0]) * segs, | 1710 | sizeof(info->shadow[i].grants_used[0]) * segs, |
@@ -1639,6 +1735,13 @@ out_of_memory: | |||
1639 | kfree(info->shadow[i].indirect_grants); | 1735 | kfree(info->shadow[i].indirect_grants); |
1640 | info->shadow[i].indirect_grants = NULL; | 1736 | info->shadow[i].indirect_grants = NULL; |
1641 | } | 1737 | } |
1738 | if (!list_empty(&info->indirect_pages)) { | ||
1739 | struct page *indirect_page, *n; | ||
1740 | list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { | ||
1741 | list_del(&indirect_page->lru); | ||
1742 | __free_page(indirect_page); | ||
1743 | } | ||
1744 | } | ||
1642 | return -ENOMEM; | 1745 | return -ENOMEM; |
1643 | } | 1746 | } |
1644 | 1747 | ||
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index f950c9d29f3e..2638417b19aa 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig | |||
@@ -13,15 +13,8 @@ config BCACHE_DEBUG | |||
13 | ---help--- | 13 | ---help--- |
14 | Don't select this option unless you're a developer | 14 | Don't select this option unless you're a developer |
15 | 15 | ||
16 | Enables extra debugging tools (primarily a fuzz tester) | 16 | Enables extra debugging tools and allows expensive runtime checks to be |
17 | 17 | turned on. | |
18 | config BCACHE_EDEBUG | ||
19 | bool "Extended runtime checks" | ||
20 | depends on BCACHE | ||
21 | ---help--- | ||
22 | Don't select this option unless you're a developer | ||
23 | |||
24 | Enables extra runtime checks which significantly affect performance | ||
25 | 18 | ||
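With BCACHE_EDEBUG gone, the expensive checks compile in under BCACHE_DEBUG and are switched on at runtime per cache set (the alloc.c changes below test expensive_debug_checks()). A sketch of what such a gate looks like; the real helper lives elsewhere in the tree, so treat this as illustrative.

/* Illustrative gate, assuming the helper reads a per-cache-set flag when
 * CONFIG_BCACHE_DEBUG is built in and compiles away otherwise. */
#ifdef CONFIG_BCACHE_DEBUG
#define expensive_debug_checks(c)	((c)->expensive_debug_checks)
#else
#define expensive_debug_checks(c)	0
#endif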
26 | config BCACHE_CLOSURES_DEBUG | 19 | config BCACHE_CLOSURES_DEBUG |
27 | bool "Debug closures" | 20 | bool "Debug closures" |
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index e45f5575fd4d..2b46bf1d7e40 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c | |||
@@ -63,13 +63,12 @@ | |||
63 | #include "bcache.h" | 63 | #include "bcache.h" |
64 | #include "btree.h" | 64 | #include "btree.h" |
65 | 65 | ||
66 | #include <linux/blkdev.h> | ||
66 | #include <linux/freezer.h> | 67 | #include <linux/freezer.h> |
67 | #include <linux/kthread.h> | 68 | #include <linux/kthread.h> |
68 | #include <linux/random.h> | 69 | #include <linux/random.h> |
69 | #include <trace/events/bcache.h> | 70 | #include <trace/events/bcache.h> |
70 | 71 | ||
71 | #define MAX_IN_FLIGHT_DISCARDS 8U | ||
72 | |||
73 | /* Bucket heap / gen */ | 72 | /* Bucket heap / gen */ |
74 | 73 | ||
75 | uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) | 74 | uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) |
@@ -121,75 +120,6 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) | |||
121 | mutex_unlock(&c->bucket_lock); | 120 | mutex_unlock(&c->bucket_lock); |
122 | } | 121 | } |
123 | 122 | ||
124 | /* Discard/TRIM */ | ||
125 | |||
126 | struct discard { | ||
127 | struct list_head list; | ||
128 | struct work_struct work; | ||
129 | struct cache *ca; | ||
130 | long bucket; | ||
131 | |||
132 | struct bio bio; | ||
133 | struct bio_vec bv; | ||
134 | }; | ||
135 | |||
136 | static void discard_finish(struct work_struct *w) | ||
137 | { | ||
138 | struct discard *d = container_of(w, struct discard, work); | ||
139 | struct cache *ca = d->ca; | ||
140 | char buf[BDEVNAME_SIZE]; | ||
141 | |||
142 | if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) { | ||
143 | pr_notice("discard error on %s, disabling", | ||
144 | bdevname(ca->bdev, buf)); | ||
145 | d->ca->discard = 0; | ||
146 | } | ||
147 | |||
148 | mutex_lock(&ca->set->bucket_lock); | ||
149 | |||
150 | fifo_push(&ca->free, d->bucket); | ||
151 | list_add(&d->list, &ca->discards); | ||
152 | atomic_dec(&ca->discards_in_flight); | ||
153 | |||
154 | mutex_unlock(&ca->set->bucket_lock); | ||
155 | |||
156 | closure_wake_up(&ca->set->bucket_wait); | ||
157 | wake_up_process(ca->alloc_thread); | ||
158 | |||
159 | closure_put(&ca->set->cl); | ||
160 | } | ||
161 | |||
162 | static void discard_endio(struct bio *bio, int error) | ||
163 | { | ||
164 | struct discard *d = container_of(bio, struct discard, bio); | ||
165 | schedule_work(&d->work); | ||
166 | } | ||
167 | |||
168 | static void do_discard(struct cache *ca, long bucket) | ||
169 | { | ||
170 | struct discard *d = list_first_entry(&ca->discards, | ||
171 | struct discard, list); | ||
172 | |||
173 | list_del(&d->list); | ||
174 | d->bucket = bucket; | ||
175 | |||
176 | atomic_inc(&ca->discards_in_flight); | ||
177 | closure_get(&ca->set->cl); | ||
178 | |||
179 | bio_init(&d->bio); | ||
180 | |||
181 | d->bio.bi_sector = bucket_to_sector(ca->set, d->bucket); | ||
182 | d->bio.bi_bdev = ca->bdev; | ||
183 | d->bio.bi_rw = REQ_WRITE|REQ_DISCARD; | ||
184 | d->bio.bi_max_vecs = 1; | ||
185 | d->bio.bi_io_vec = d->bio.bi_inline_vecs; | ||
186 | d->bio.bi_size = bucket_bytes(ca); | ||
187 | d->bio.bi_end_io = discard_endio; | ||
188 | bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); | ||
189 | |||
190 | submit_bio(0, &d->bio); | ||
191 | } | ||
192 | |||
193 | /* Allocation */ | 123 | /* Allocation */ |
194 | 124 | ||
195 | static inline bool can_inc_bucket_gen(struct bucket *b) | 125 | static inline bool can_inc_bucket_gen(struct bucket *b) |
@@ -280,7 +210,7 @@ static void invalidate_buckets_lru(struct cache *ca) | |||
280 | * multiple times when it can't do anything | 210 | * multiple times when it can't do anything |
281 | */ | 211 | */ |
282 | ca->invalidate_needs_gc = 1; | 212 | ca->invalidate_needs_gc = 1; |
283 | bch_queue_gc(ca->set); | 213 | wake_up_gc(ca->set); |
284 | return; | 214 | return; |
285 | } | 215 | } |
286 | 216 | ||
@@ -305,7 +235,7 @@ static void invalidate_buckets_fifo(struct cache *ca) | |||
305 | 235 | ||
306 | if (++checked >= ca->sb.nbuckets) { | 236 | if (++checked >= ca->sb.nbuckets) { |
307 | ca->invalidate_needs_gc = 1; | 237 | ca->invalidate_needs_gc = 1; |
308 | bch_queue_gc(ca->set); | 238 | wake_up_gc(ca->set); |
309 | return; | 239 | return; |
310 | } | 240 | } |
311 | } | 241 | } |
@@ -330,7 +260,7 @@ static void invalidate_buckets_random(struct cache *ca) | |||
330 | 260 | ||
331 | if (++checked >= ca->sb.nbuckets / 2) { | 261 | if (++checked >= ca->sb.nbuckets / 2) { |
332 | ca->invalidate_needs_gc = 1; | 262 | ca->invalidate_needs_gc = 1; |
333 | bch_queue_gc(ca->set); | 263 | wake_up_gc(ca->set); |
334 | return; | 264 | return; |
335 | } | 265 | } |
336 | } | 266 | } |
@@ -398,16 +328,18 @@ static int bch_allocator_thread(void *arg) | |||
398 | else | 328 | else |
399 | break; | 329 | break; |
400 | 330 | ||
401 | allocator_wait(ca, (int) fifo_free(&ca->free) > | ||
402 | atomic_read(&ca->discards_in_flight)); | ||
403 | |||
404 | if (ca->discard) { | 331 | if (ca->discard) { |
405 | allocator_wait(ca, !list_empty(&ca->discards)); | 332 | mutex_unlock(&ca->set->bucket_lock); |
406 | do_discard(ca, bucket); | 333 | blkdev_issue_discard(ca->bdev, |
407 | } else { | 334 | bucket_to_sector(ca->set, bucket), |
408 | fifo_push(&ca->free, bucket); | 335 | ca->sb.block_size, GFP_KERNEL, 0); |
409 | closure_wake_up(&ca->set->bucket_wait); | 336 | mutex_lock(&ca->set->bucket_lock); |
410 | } | 337 | } |
338 | |||
339 | allocator_wait(ca, !fifo_full(&ca->free)); | ||
340 | |||
341 | fifo_push(&ca->free, bucket); | ||
342 | wake_up(&ca->set->bucket_wait); | ||
411 | } | 343 | } |
412 | 344 | ||
413 | /* | 345 | /* |
@@ -433,16 +365,40 @@ static int bch_allocator_thread(void *arg) | |||
433 | } | 365 | } |
434 | } | 366 | } |
435 | 367 | ||
436 | long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl) | 368 | long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait) |
437 | { | 369 | { |
438 | long r = -1; | 370 | DEFINE_WAIT(w); |
439 | again: | 371 | struct bucket *b; |
372 | long r; | ||
373 | |||
374 | /* fastpath */ | ||
375 | if (fifo_used(&ca->free) > ca->watermark[watermark]) { | ||
376 | fifo_pop(&ca->free, r); | ||
377 | goto out; | ||
378 | } | ||
379 | |||
380 | if (!wait) | ||
381 | return -1; | ||
382 | |||
383 | while (1) { | ||
384 | if (fifo_used(&ca->free) > ca->watermark[watermark]) { | ||
385 | fifo_pop(&ca->free, r); | ||
386 | break; | ||
387 | } | ||
388 | |||
389 | prepare_to_wait(&ca->set->bucket_wait, &w, | ||
390 | TASK_UNINTERRUPTIBLE); | ||
391 | |||
392 | mutex_unlock(&ca->set->bucket_lock); | ||
393 | schedule(); | ||
394 | mutex_lock(&ca->set->bucket_lock); | ||
395 | } | ||
396 | |||
397 | finish_wait(&ca->set->bucket_wait, &w); | ||
398 | out: | ||
440 | wake_up_process(ca->alloc_thread); | 399 | wake_up_process(ca->alloc_thread); |
441 | 400 | ||
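bch_bucket_alloc() now uses the stock wait-queue idiom instead of closures. A generic sketch of that idiom follows; try_grab_resource() and the lock/wait-queue parameters are placeholders standing in for the bucket fifo, bucket_lock and bucket_wait, not bcache functions.

/* Generic sketch of the prepare_to_wait()/schedule() pattern used above.
 * Called with 'lock' held; the producer must take the same lock when it
 * refills the resource and then call wake_up() on 'wq'. */
#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/wait.h>

static long try_grab_resource(void);	/* placeholder: returns -1 if empty */

static long wait_for_resource(wait_queue_head_t *wq, struct mutex *lock)
{
	DEFINE_WAIT(w);
	long r;

	while ((r = try_grab_resource()) < 0) {
		prepare_to_wait(wq, &w, TASK_UNINTERRUPTIBLE);

		/* Drop the lock so the producer can refill, then sleep until
		 * it wakes us; re-check the condition after waking. */
		mutex_unlock(lock);
		schedule();
		mutex_lock(lock);
	}

	finish_wait(wq, &w);
	return r;
}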
442 | if (fifo_used(&ca->free) > ca->watermark[watermark] && | 401 | if (expensive_debug_checks(ca->set)) { |
443 | fifo_pop(&ca->free, r)) { | ||
444 | struct bucket *b = ca->buckets + r; | ||
445 | #ifdef CONFIG_BCACHE_EDEBUG | ||
446 | size_t iter; | 402 | size_t iter; |
447 | long i; | 403 | long i; |
448 | 404 | ||
@@ -455,36 +411,23 @@ again: | |||
455 | BUG_ON(i == r); | 411 | BUG_ON(i == r); |
456 | fifo_for_each(i, &ca->unused, iter) | 412 | fifo_for_each(i, &ca->unused, iter) |
457 | BUG_ON(i == r); | 413 | BUG_ON(i == r); |
458 | #endif | ||
459 | BUG_ON(atomic_read(&b->pin) != 1); | ||
460 | |||
461 | SET_GC_SECTORS_USED(b, ca->sb.bucket_size); | ||
462 | |||
463 | if (watermark <= WATERMARK_METADATA) { | ||
464 | SET_GC_MARK(b, GC_MARK_METADATA); | ||
465 | b->prio = BTREE_PRIO; | ||
466 | } else { | ||
467 | SET_GC_MARK(b, GC_MARK_RECLAIMABLE); | ||
468 | b->prio = INITIAL_PRIO; | ||
469 | } | ||
470 | |||
471 | return r; | ||
472 | } | 414 | } |
473 | 415 | ||
474 | trace_bcache_alloc_fail(ca); | 416 | b = ca->buckets + r; |
475 | 417 | ||
476 | if (cl) { | 418 | BUG_ON(atomic_read(&b->pin) != 1); |
477 | closure_wait(&ca->set->bucket_wait, cl); | ||
478 | 419 | ||
479 | if (closure_blocking(cl)) { | 420 | SET_GC_SECTORS_USED(b, ca->sb.bucket_size); |
480 | mutex_unlock(&ca->set->bucket_lock); | 421 | |
481 | closure_sync(cl); | 422 | if (watermark <= WATERMARK_METADATA) { |
482 | mutex_lock(&ca->set->bucket_lock); | 423 | SET_GC_MARK(b, GC_MARK_METADATA); |
483 | goto again; | 424 | b->prio = BTREE_PRIO; |
484 | } | 425 | } else { |
426 | SET_GC_MARK(b, GC_MARK_RECLAIMABLE); | ||
427 | b->prio = INITIAL_PRIO; | ||
485 | } | 428 | } |
486 | 429 | ||
487 | return -1; | 430 | return r; |
488 | } | 431 | } |
489 | 432 | ||
490 | void bch_bucket_free(struct cache_set *c, struct bkey *k) | 433 | void bch_bucket_free(struct cache_set *c, struct bkey *k) |
@@ -501,7 +444,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k) | |||
501 | } | 444 | } |
502 | 445 | ||
503 | int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, | 446 | int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, |
504 | struct bkey *k, int n, struct closure *cl) | 447 | struct bkey *k, int n, bool wait) |
505 | { | 448 | { |
506 | int i; | 449 | int i; |
507 | 450 | ||
@@ -514,7 +457,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, | |||
514 | 457 | ||
515 | for (i = 0; i < n; i++) { | 458 | for (i = 0; i < n; i++) { |
516 | struct cache *ca = c->cache_by_alloc[i]; | 459 | struct cache *ca = c->cache_by_alloc[i]; |
517 | long b = bch_bucket_alloc(ca, watermark, cl); | 460 | long b = bch_bucket_alloc(ca, watermark, wait); |
518 | 461 | ||
519 | if (b == -1) | 462 | if (b == -1) |
520 | goto err; | 463 | goto err; |
@@ -529,22 +472,202 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, | |||
529 | return 0; | 472 | return 0; |
530 | err: | 473 | err: |
531 | bch_bucket_free(c, k); | 474 | bch_bucket_free(c, k); |
532 | __bkey_put(c, k); | 475 | bkey_put(c, k); |
533 | return -1; | 476 | return -1; |
534 | } | 477 | } |
535 | 478 | ||
536 | int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, | 479 | int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, |
537 | struct bkey *k, int n, struct closure *cl) | 480 | struct bkey *k, int n, bool wait) |
538 | { | 481 | { |
539 | int ret; | 482 | int ret; |
540 | mutex_lock(&c->bucket_lock); | 483 | mutex_lock(&c->bucket_lock); |
541 | ret = __bch_bucket_alloc_set(c, watermark, k, n, cl); | 484 | ret = __bch_bucket_alloc_set(c, watermark, k, n, wait); |
542 | mutex_unlock(&c->bucket_lock); | 485 | mutex_unlock(&c->bucket_lock); |
543 | return ret; | 486 | return ret; |
544 | } | 487 | } |
545 | 488 | ||
489 | /* Sector allocator */ | ||
490 | |||
491 | struct open_bucket { | ||
492 | struct list_head list; | ||
493 | unsigned last_write_point; | ||
494 | unsigned sectors_free; | ||
495 | BKEY_PADDED(key); | ||
496 | }; | ||
497 | |||
498 | /* | ||
499 | * We keep multiple buckets open for writes, and try to segregate different | ||
500 | * write streams for better cache utilization: first we look for a bucket where | ||
501 | * the last write to it was sequential with the current write, and failing that | ||
502 | * we look for a bucket that was last used by the same task. | ||
503 | * | ||
504 | * The idea is that if you've got multiple tasks pulling data into the cache at the ||
505 | * same time, you'll get better cache utilization if you try to segregate their | ||
506 | * data and preserve locality. | ||
507 | * | ||
508 | * For example, say you're starting Firefox at the same time you're copying a ||
509 | * bunch of files. Firefox will likely end up being fairly hot and stay in the | ||
510 | * cache awhile, but the data you copied might not be; if you wrote all that | ||
511 | * data to the same buckets it'd get invalidated at the same time. | ||
512 | * | ||
513 | * Both of those tasks will be doing fairly random IO so we can't rely on | ||
514 | * detecting sequential IO to segregate their data, but going off of the task | ||
515 | * should be a sane heuristic. | ||
516 | */ | ||
517 | static struct open_bucket *pick_data_bucket(struct cache_set *c, | ||
518 | const struct bkey *search, | ||
519 | unsigned write_point, | ||
520 | struct bkey *alloc) | ||
521 | { | ||
522 | struct open_bucket *ret, *ret_task = NULL; | ||
523 | |||
524 | list_for_each_entry_reverse(ret, &c->data_buckets, list) | ||
525 | if (!bkey_cmp(&ret->key, search)) | ||
526 | goto found; | ||
527 | else if (ret->last_write_point == write_point) | ||
528 | ret_task = ret; | ||
529 | |||
530 | ret = ret_task ?: list_first_entry(&c->data_buckets, | ||
531 | struct open_bucket, list); | ||
532 | found: | ||
533 | if (!ret->sectors_free && KEY_PTRS(alloc)) { | ||
534 | ret->sectors_free = c->sb.bucket_size; | ||
535 | bkey_copy(&ret->key, alloc); | ||
536 | bkey_init(alloc); | ||
537 | } | ||
538 | |||
539 | if (!ret->sectors_free) | ||
540 | ret = NULL; | ||
541 | |||
542 | return ret; | ||
543 | } | ||
544 | |||
545 | /* | ||
546 | * Allocates some space in the cache to write to, sets k to point to the newly ||
547 | * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the ||
548 | * end of the newly allocated space). | ||
549 | * | ||
550 | * May allocate fewer sectors than @sectors; KEY_SIZE(k) indicates how many ||
551 | * sectors were actually allocated. | ||
552 | * | ||
553 | * If wait is true, will not fail. ||
554 | */ | ||
555 | bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, | ||
556 | unsigned write_point, unsigned write_prio, bool wait) | ||
557 | { | ||
558 | struct open_bucket *b; | ||
559 | BKEY_PADDED(key) alloc; | ||
560 | unsigned i; | ||
561 | |||
562 | /* | ||
563 | * We might have to allocate a new bucket, which we can't do with a | ||
564 | * spinlock held. So if we have to allocate, we drop the lock, allocate | ||
565 | * and then retry. KEY_PTRS() indicates whether alloc points to | ||
566 | * allocated bucket(s). | ||
567 | */ | ||
568 | |||
569 | bkey_init(&alloc.key); | ||
570 | spin_lock(&c->data_bucket_lock); | ||
571 | |||
572 | while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { | ||
573 | unsigned watermark = write_prio | ||
574 | ? WATERMARK_MOVINGGC | ||
575 | : WATERMARK_NONE; | ||
576 | |||
577 | spin_unlock(&c->data_bucket_lock); | ||
578 | |||
579 | if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait)) | ||
580 | return false; | ||
581 | |||
582 | spin_lock(&c->data_bucket_lock); | ||
583 | } | ||
584 | |||
585 | /* | ||
586 | * If we had to allocate, we might race and not need to allocate the | ||
587 | * second time we call pick_data_bucket(). If we allocated a bucket but ||
588 | * didn't use it, drop the refcount bch_bucket_alloc_set() took: | ||
589 | */ | ||
590 | if (KEY_PTRS(&alloc.key)) | ||
591 | bkey_put(c, &alloc.key); | ||
592 | |||
593 | for (i = 0; i < KEY_PTRS(&b->key); i++) | ||
594 | EBUG_ON(ptr_stale(c, &b->key, i)); | ||
595 | |||
596 | /* Set up the pointer to the space we're allocating: */ | ||
597 | |||
598 | for (i = 0; i < KEY_PTRS(&b->key); i++) | ||
599 | k->ptr[i] = b->key.ptr[i]; | ||
600 | |||
601 | sectors = min(sectors, b->sectors_free); | ||
602 | |||
603 | SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); | ||
604 | SET_KEY_SIZE(k, sectors); | ||
605 | SET_KEY_PTRS(k, KEY_PTRS(&b->key)); | ||
606 | |||
607 | /* | ||
608 | * Move b to the end of the lru, and keep track of what this bucket was | ||
609 | * last used for: | ||
610 | */ | ||
611 | list_move_tail(&b->list, &c->data_buckets); | ||
612 | bkey_copy_key(&b->key, k); | ||
613 | b->last_write_point = write_point; | ||
614 | |||
615 | b->sectors_free -= sectors; | ||
616 | |||
617 | for (i = 0; i < KEY_PTRS(&b->key); i++) { | ||
618 | SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); | ||
619 | |||
620 | atomic_long_add(sectors, | ||
621 | &PTR_CACHE(c, &b->key, i)->sectors_written); | ||
622 | } | ||
623 | |||
624 | if (b->sectors_free < c->sb.block_size) | ||
625 | b->sectors_free = 0; | ||
626 | |||
627 | /* | ||
628 | * k takes refcounts on the buckets it points to until it's inserted | ||
629 | * into the btree, but if we're done with this bucket we just transfer | ||
630 | * get_data_bucket()'s refcount. | ||
631 | */ | ||
632 | if (b->sectors_free) | ||
633 | for (i = 0; i < KEY_PTRS(&b->key); i++) | ||
634 | atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); | ||
635 | |||
636 | spin_unlock(&c->data_bucket_lock); | ||
637 | return true; | ||
638 | } | ||
639 | |||
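As a usage illustration only: because bch_alloc_sectors() may hand back fewer sectors than requested, a writer keeps calling it until the whole extent is covered. write_extent() and its parameters below are made up for the example; KEY(), BKEY_PADDED(), bkey_copy() and KEY_SIZE() are helpers visible elsewhere in this patch.

/* Hypothetical caller of bch_alloc_sectors(); not the driver's write path. */
static bool write_extent(struct cache_set *c, unsigned inode, uint64_t sector,
			 unsigned sectors, unsigned write_point,
			 unsigned write_prio, bool wait)
{
	while (sectors) {
		BKEY_PADDED(key) tmp;

		/* Seed the key with the device inode and current offset. */
		bkey_copy(&tmp.key, &KEY(inode, sector, 0));

		if (!bch_alloc_sectors(c, &tmp.key, sectors, write_point,
				       write_prio, wait))
			return false;	/* no space and wait == false */

		/* KEY_SIZE() now says how much was actually allocated. */
		sector  += KEY_SIZE(&tmp.key);
		sectors -= KEY_SIZE(&tmp.key);

		/* ... build and submit the cache IO described by tmp.key ... */
	}

	return true;
}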
546 | /* Init */ | 640 | /* Init */ |
547 | 641 | ||
642 | void bch_open_buckets_free(struct cache_set *c) | ||
643 | { | ||
644 | struct open_bucket *b; | ||
645 | |||
646 | while (!list_empty(&c->data_buckets)) { | ||
647 | b = list_first_entry(&c->data_buckets, | ||
648 | struct open_bucket, list); | ||
649 | list_del(&b->list); | ||
650 | kfree(b); | ||
651 | } | ||
652 | } | ||
653 | |||
654 | int bch_open_buckets_alloc(struct cache_set *c) | ||
655 | { | ||
656 | int i; | ||
657 | |||
658 | spin_lock_init(&c->data_bucket_lock); | ||
659 | |||
660 | for (i = 0; i < 6; i++) { | ||
661 | struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); | ||
662 | if (!b) | ||
663 | return -ENOMEM; | ||
664 | |||
665 | list_add(&b->list, &c->data_buckets); | ||
666 | } | ||
667 | |||
668 | return 0; | ||
669 | } | ||
670 | |||
548 | int bch_cache_allocator_start(struct cache *ca) | 671 | int bch_cache_allocator_start(struct cache *ca) |
549 | { | 672 | { |
550 | struct task_struct *k = kthread_run(bch_allocator_thread, | 673 | struct task_struct *k = kthread_run(bch_allocator_thread, |
@@ -556,22 +679,8 @@ int bch_cache_allocator_start(struct cache *ca) | |||
556 | return 0; | 679 | return 0; |
557 | } | 680 | } |
558 | 681 | ||
559 | void bch_cache_allocator_exit(struct cache *ca) | ||
560 | { | ||
561 | struct discard *d; | ||
562 | |||
563 | while (!list_empty(&ca->discards)) { | ||
564 | d = list_first_entry(&ca->discards, struct discard, list); | ||
565 | cancel_work_sync(&d->work); | ||
566 | list_del(&d->list); | ||
567 | kfree(d); | ||
568 | } | ||
569 | } | ||
570 | |||
571 | int bch_cache_allocator_init(struct cache *ca) | 682 | int bch_cache_allocator_init(struct cache *ca) |
572 | { | 683 | { |
573 | unsigned i; | ||
574 | |||
575 | /* | 684 | /* |
576 | * Reserve: | 685 | * Reserve: |
577 | * Prio/gen writes first | 686 | * Prio/gen writes first |
@@ -589,15 +698,5 @@ int bch_cache_allocator_init(struct cache *ca) | |||
589 | ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + | 698 | ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + |
590 | ca->watermark[WATERMARK_MOVINGGC]; | 699 | ca->watermark[WATERMARK_MOVINGGC]; |
591 | 700 | ||
592 | for (i = 0; i < MAX_IN_FLIGHT_DISCARDS; i++) { | ||
593 | struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL); | ||
594 | if (!d) | ||
595 | return -ENOMEM; | ||
596 | |||
597 | d->ca = ca; | ||
598 | INIT_WORK(&d->work, discard_finish); | ||
599 | list_add(&d->list, &ca->discards); | ||
600 | } | ||
601 | |||
602 | return 0; | 701 | return 0; |
603 | } | 702 | } |
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 0f12382aa35d..4beb55a0ff30 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h | |||
@@ -177,6 +177,7 @@ | |||
177 | 177 | ||
178 | #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ | 178 | #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ |
179 | 179 | ||
180 | #include <linux/bcache.h> | ||
180 | #include <linux/bio.h> | 181 | #include <linux/bio.h> |
181 | #include <linux/kobject.h> | 182 | #include <linux/kobject.h> |
182 | #include <linux/list.h> | 183 | #include <linux/list.h> |
@@ -210,168 +211,6 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); | |||
210 | #define GC_MARK_METADATA 2 | 211 | #define GC_MARK_METADATA 2 |
211 | BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14); | 212 | BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14); |
212 | 213 | ||
213 | struct bkey { | ||
214 | uint64_t high; | ||
215 | uint64_t low; | ||
216 | uint64_t ptr[]; | ||
217 | }; | ||
218 | |||
219 | /* Enough for a key with 6 pointers */ | ||
220 | #define BKEY_PAD 8 | ||
221 | |||
222 | #define BKEY_PADDED(key) \ | ||
223 | union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; } | ||
224 | |||
225 | /* Version 0: Cache device | ||
226 | * Version 1: Backing device | ||
227 | * Version 2: Seed pointer into btree node checksum | ||
228 | * Version 3: Cache device with new UUID format | ||
229 | * Version 4: Backing device with data offset | ||
230 | */ | ||
231 | #define BCACHE_SB_VERSION_CDEV 0 | ||
232 | #define BCACHE_SB_VERSION_BDEV 1 | ||
233 | #define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 | ||
234 | #define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 | ||
235 | #define BCACHE_SB_MAX_VERSION 4 | ||
236 | |||
237 | #define SB_SECTOR 8 | ||
238 | #define SB_SIZE 4096 | ||
239 | #define SB_LABEL_SIZE 32 | ||
240 | #define SB_JOURNAL_BUCKETS 256U | ||
241 | /* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ | ||
242 | #define MAX_CACHES_PER_SET 8 | ||
243 | |||
244 | #define BDEV_DATA_START_DEFAULT 16 /* sectors */ | ||
245 | |||
246 | struct cache_sb { | ||
247 | uint64_t csum; | ||
248 | uint64_t offset; /* sector where this sb was written */ | ||
249 | uint64_t version; | ||
250 | |||
251 | uint8_t magic[16]; | ||
252 | |||
253 | uint8_t uuid[16]; | ||
254 | union { | ||
255 | uint8_t set_uuid[16]; | ||
256 | uint64_t set_magic; | ||
257 | }; | ||
258 | uint8_t label[SB_LABEL_SIZE]; | ||
259 | |||
260 | uint64_t flags; | ||
261 | uint64_t seq; | ||
262 | uint64_t pad[8]; | ||
263 | |||
264 | union { | ||
265 | struct { | ||
266 | /* Cache devices */ | ||
267 | uint64_t nbuckets; /* device size */ | ||
268 | |||
269 | uint16_t block_size; /* sectors */ | ||
270 | uint16_t bucket_size; /* sectors */ | ||
271 | |||
272 | uint16_t nr_in_set; | ||
273 | uint16_t nr_this_dev; | ||
274 | }; | ||
275 | struct { | ||
276 | /* Backing devices */ | ||
277 | uint64_t data_offset; | ||
278 | |||
279 | /* | ||
280 | * block_size from the cache device section is still used by | ||
281 | * backing devices, so don't add anything here until we fix | ||
282 | * things to not need it for backing devices anymore | ||
283 | */ | ||
284 | }; | ||
285 | }; | ||
286 | |||
287 | uint32_t last_mount; /* time_t */ | ||
288 | |||
289 | uint16_t first_bucket; | ||
290 | union { | ||
291 | uint16_t njournal_buckets; | ||
292 | uint16_t keys; | ||
293 | }; | ||
294 | uint64_t d[SB_JOURNAL_BUCKETS]; /* journal buckets */ | ||
295 | }; | ||
296 | |||
297 | BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); | ||
298 | BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); | ||
299 | BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); | ||
300 | #define CACHE_REPLACEMENT_LRU 0U | ||
301 | #define CACHE_REPLACEMENT_FIFO 1U | ||
302 | #define CACHE_REPLACEMENT_RANDOM 2U | ||
303 | |||
304 | BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); | ||
305 | #define CACHE_MODE_WRITETHROUGH 0U | ||
306 | #define CACHE_MODE_WRITEBACK 1U | ||
307 | #define CACHE_MODE_WRITEAROUND 2U | ||
308 | #define CACHE_MODE_NONE 3U | ||
309 | BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); | ||
310 | #define BDEV_STATE_NONE 0U | ||
311 | #define BDEV_STATE_CLEAN 1U | ||
312 | #define BDEV_STATE_DIRTY 2U | ||
313 | #define BDEV_STATE_STALE 3U | ||
314 | |||
315 | /* Version 1: Seed pointer into btree node checksum | ||
316 | */ | ||
317 | #define BCACHE_BSET_VERSION 1 | ||
318 | |||
319 | /* | ||
320 | * This is the on disk format for btree nodes - a btree node on disk is a list | ||
321 | * of these; within each set the keys are sorted | ||
322 | */ | ||
323 | struct bset { | ||
324 | uint64_t csum; | ||
325 | uint64_t magic; | ||
326 | uint64_t seq; | ||
327 | uint32_t version; | ||
328 | uint32_t keys; | ||
329 | |||
330 | union { | ||
331 | struct bkey start[0]; | ||
332 | uint64_t d[0]; | ||
333 | }; | ||
334 | }; | ||
335 | |||
336 | /* | ||
337 | * On disk format for priorities and gens - see super.c near prio_write() for | ||
338 | * more. | ||
339 | */ | ||
340 | struct prio_set { | ||
341 | uint64_t csum; | ||
342 | uint64_t magic; | ||
343 | uint64_t seq; | ||
344 | uint32_t version; | ||
345 | uint32_t pad; | ||
346 | |||
347 | uint64_t next_bucket; | ||
348 | |||
349 | struct bucket_disk { | ||
350 | uint16_t prio; | ||
351 | uint8_t gen; | ||
352 | } __attribute((packed)) data[]; | ||
353 | }; | ||
354 | |||
355 | struct uuid_entry { | ||
356 | union { | ||
357 | struct { | ||
358 | uint8_t uuid[16]; | ||
359 | uint8_t label[32]; | ||
360 | uint32_t first_reg; | ||
361 | uint32_t last_reg; | ||
362 | uint32_t invalidated; | ||
363 | |||
364 | uint32_t flags; | ||
365 | /* Size of flash only volumes */ | ||
366 | uint64_t sectors; | ||
367 | }; | ||
368 | |||
369 | uint8_t pad[128]; | ||
370 | }; | ||
371 | }; | ||
372 | |||
373 | BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); | ||
374 | |||
375 | #include "journal.h" | 214 | #include "journal.h" |
376 | #include "stats.h" | 215 | #include "stats.h" |
377 | struct search; | 216 | struct search; |
@@ -384,8 +223,6 @@ struct keybuf_key { | |||
384 | void *private; | 223 | void *private; |
385 | }; | 224 | }; |
386 | 225 | ||
387 | typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); | ||
388 | |||
389 | struct keybuf { | 226 | struct keybuf { |
390 | struct bkey last_scanned; | 227 | struct bkey last_scanned; |
391 | spinlock_t lock; | 228 | spinlock_t lock; |
@@ -400,7 +237,7 @@ struct keybuf { | |||
400 | 237 | ||
401 | struct rb_root keys; | 238 | struct rb_root keys; |
402 | 239 | ||
403 | #define KEYBUF_NR 100 | 240 | #define KEYBUF_NR 500 |
404 | DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR); | 241 | DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR); |
405 | }; | 242 | }; |
406 | 243 | ||
@@ -429,16 +266,15 @@ struct bcache_device { | |||
429 | 266 | ||
430 | struct gendisk *disk; | 267 | struct gendisk *disk; |
431 | 268 | ||
432 | /* If nonzero, we're closing */ | 269 | unsigned long flags; |
433 | atomic_t closing; | 270 | #define BCACHE_DEV_CLOSING 0 |
434 | 271 | #define BCACHE_DEV_DETACHING 1 | |
435 | /* If nonzero, we're detaching/unregistering from cache set */ | 272 | #define BCACHE_DEV_UNLINK_DONE 2 |
436 | atomic_t detaching; | ||
437 | int flush_done; | ||
438 | 273 | ||
439 | uint64_t nr_stripes; | 274 | unsigned nr_stripes; |
440 | unsigned stripe_size_bits; | 275 | unsigned stripe_size; |
441 | atomic_t *stripe_sectors_dirty; | 276 | atomic_t *stripe_sectors_dirty; |
277 | unsigned long *full_dirty_stripes; | ||
442 | 278 | ||
443 | unsigned long sectors_dirty_last; | 279 | unsigned long sectors_dirty_last; |
444 | long sectors_dirty_derivative; | 280 | long sectors_dirty_derivative; |
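The BCACHE_DEV_* values above are bit numbers in the new flags word, replacing the old closing/detaching atomics. A minimal sketch of how such bits are driven with the standard bitops; the helper names here are made up.

/* Illustrative only: set_bit()/test_bit() on the unsigned long flags word. */
static void start_closing(struct bcache_device *d)
{
	set_bit(BCACHE_DEV_CLOSING, &d->flags);
}

static bool is_closing(struct bcache_device *d)
{
	return test_bit(BCACHE_DEV_CLOSING, &d->flags);
}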
@@ -509,7 +345,7 @@ struct cached_dev { | |||
509 | 345 | ||
510 | /* Limit number of writeback bios in flight */ | 346 | /* Limit number of writeback bios in flight */ |
511 | struct semaphore in_flight; | 347 | struct semaphore in_flight; |
512 | struct closure_with_timer writeback; | 348 | struct task_struct *writeback_thread; |
513 | 349 | ||
514 | struct keybuf writeback_keys; | 350 | struct keybuf writeback_keys; |
515 | 351 | ||
@@ -527,8 +363,8 @@ struct cached_dev { | |||
527 | unsigned sequential_cutoff; | 363 | unsigned sequential_cutoff; |
528 | unsigned readahead; | 364 | unsigned readahead; |
529 | 365 | ||
530 | unsigned sequential_merge:1; | ||
531 | unsigned verify:1; | 366 | unsigned verify:1; |
367 | unsigned bypass_torture_test:1; | ||
532 | 368 | ||
533 | unsigned partial_stripes_expensive:1; | 369 | unsigned partial_stripes_expensive:1; |
534 | unsigned writeback_metadata:1; | 370 | unsigned writeback_metadata:1; |
@@ -620,15 +456,6 @@ struct cache { | |||
620 | 456 | ||
621 | bool discard; /* Get rid of? */ | 457 | bool discard; /* Get rid of? */ |
622 | 458 | ||
623 | /* | ||
624 | * We preallocate structs for issuing discards to buckets, and keep them | ||
625 | * on this list when they're not in use; do_discard() issues discards | ||
626 | * whenever there's work to do and is called by free_some_buckets() and | ||
627 | * when a discard finishes. | ||
628 | */ | ||
629 | atomic_t discards_in_flight; | ||
630 | struct list_head discards; | ||
631 | |||
632 | struct journal_device journal; | 459 | struct journal_device journal; |
633 | 460 | ||
634 | /* The rest of this all shows up in sysfs */ | 461 | /* The rest of this all shows up in sysfs */ |
@@ -649,7 +476,6 @@ struct gc_stat { | |||
649 | 476 | ||
650 | size_t nkeys; | 477 | size_t nkeys; |
651 | uint64_t data; /* sectors */ | 478 | uint64_t data; /* sectors */ |
652 | uint64_t dirty; /* sectors */ | ||
653 | unsigned in_use; /* percent */ | 479 | unsigned in_use; /* percent */ |
654 | }; | 480 | }; |
655 | 481 | ||
@@ -744,8 +570,8 @@ struct cache_set { | |||
744 | * basically a lock for this that we can wait on asynchronously. The | 570 | * basically a lock for this that we can wait on asynchronously. The |
745 | * btree_root() macro releases the lock when it returns. | 571 | * btree_root() macro releases the lock when it returns. |
746 | */ | 572 | */ |
747 | struct closure *try_harder; | 573 | struct task_struct *try_harder; |
748 | struct closure_waitlist try_wait; | 574 | wait_queue_head_t try_wait; |
749 | uint64_t try_harder_start; | 575 | uint64_t try_harder_start; |
750 | 576 | ||
751 | /* | 577 | /* |
@@ -759,7 +585,7 @@ struct cache_set { | |||
759 | * written. | 585 | * written. |
760 | */ | 586 | */ |
761 | atomic_t prio_blocked; | 587 | atomic_t prio_blocked; |
762 | struct closure_waitlist bucket_wait; | 588 | wait_queue_head_t bucket_wait; |
763 | 589 | ||
764 | /* | 590 | /* |
765 | * For any bio we don't skip we subtract the number of sectors from | 591 | * For any bio we don't skip we subtract the number of sectors from |
@@ -782,7 +608,7 @@ struct cache_set { | |||
782 | struct gc_stat gc_stats; | 608 | struct gc_stat gc_stats; |
783 | size_t nbuckets; | 609 | size_t nbuckets; |
784 | 610 | ||
785 | struct closure_with_waitlist gc; | 611 | struct task_struct *gc_thread; |
786 | /* Where in the btree gc currently is */ | 612 | /* Where in the btree gc currently is */ |
787 | struct bkey gc_done; | 613 | struct bkey gc_done; |
788 | 614 | ||
@@ -795,11 +621,10 @@ struct cache_set { | |||
795 | /* Counts how many sectors bio_insert has added to the cache */ | 621 | /* Counts how many sectors bio_insert has added to the cache */ |
796 | atomic_t sectors_to_gc; | 622 | atomic_t sectors_to_gc; |
797 | 623 | ||
798 | struct closure moving_gc; | 624 | wait_queue_head_t moving_gc_wait; |
799 | struct closure_waitlist moving_gc_wait; | ||
800 | struct keybuf moving_gc_keys; | 625 | struct keybuf moving_gc_keys; |
801 | /* Number of moving GC bios in flight */ | 626 | /* Number of moving GC bios in flight */ |
802 | atomic_t in_flight; | 627 | struct semaphore moving_in_flight; |
803 | 628 | ||
804 | struct btree *root; | 629 | struct btree *root; |
805 | 630 | ||
@@ -841,22 +666,27 @@ struct cache_set { | |||
841 | unsigned congested_read_threshold_us; | 666 | unsigned congested_read_threshold_us; |
842 | unsigned congested_write_threshold_us; | 667 | unsigned congested_write_threshold_us; |
843 | 668 | ||
844 | spinlock_t sort_time_lock; | ||
845 | struct time_stats sort_time; | 669 | struct time_stats sort_time; |
846 | struct time_stats btree_gc_time; | 670 | struct time_stats btree_gc_time; |
847 | struct time_stats btree_split_time; | 671 | struct time_stats btree_split_time; |
848 | spinlock_t btree_read_time_lock; | ||
849 | struct time_stats btree_read_time; | 672 | struct time_stats btree_read_time; |
850 | struct time_stats try_harder_time; | 673 | struct time_stats try_harder_time; |
851 | 674 | ||
852 | atomic_long_t cache_read_races; | 675 | atomic_long_t cache_read_races; |
853 | atomic_long_t writeback_keys_done; | 676 | atomic_long_t writeback_keys_done; |
854 | atomic_long_t writeback_keys_failed; | 677 | atomic_long_t writeback_keys_failed; |
678 | |||
679 | enum { | ||
680 | ON_ERROR_UNREGISTER, | ||
681 | ON_ERROR_PANIC, | ||
682 | } on_error; | ||
855 | unsigned error_limit; | 683 | unsigned error_limit; |
856 | unsigned error_decay; | 684 | unsigned error_decay; |
685 | |||
857 | unsigned short journal_delay_ms; | 686 | unsigned short journal_delay_ms; |
858 | unsigned verify:1; | 687 | unsigned verify:1; |
859 | unsigned key_merging_disabled:1; | 688 | unsigned key_merging_disabled:1; |
689 | unsigned expensive_debug_checks:1; | ||
860 | unsigned gc_always_rewrite:1; | 690 | unsigned gc_always_rewrite:1; |
861 | unsigned shrinker_disabled:1; | 691 | unsigned shrinker_disabled:1; |
862 | unsigned copy_gc_enabled:1; | 692 | unsigned copy_gc_enabled:1; |
@@ -865,21 +695,6 @@ struct cache_set { | |||
865 | struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; | 695 | struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; |
866 | }; | 696 | }; |
867 | 697 | ||
868 | static inline bool key_merging_disabled(struct cache_set *c) | ||
869 | { | ||
870 | #ifdef CONFIG_BCACHE_DEBUG | ||
871 | return c->key_merging_disabled; | ||
872 | #else | ||
873 | return 0; | ||
874 | #endif | ||
875 | } | ||
876 | |||
877 | static inline bool SB_IS_BDEV(const struct cache_sb *sb) | ||
878 | { | ||
879 | return sb->version == BCACHE_SB_VERSION_BDEV | ||
880 | || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; | ||
881 | } | ||
882 | |||
883 | struct bbio { | 698 | struct bbio { |
884 | unsigned submit_time_us; | 699 | unsigned submit_time_us; |
885 | union { | 700 | union { |
@@ -933,59 +748,6 @@ static inline unsigned local_clock_us(void) | |||
933 | #define prio_buckets(c) \ | 748 | #define prio_buckets(c) \ |
934 | DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c)) | 749 | DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c)) |
935 | 750 | ||
936 | #define JSET_MAGIC 0x245235c1a3625032ULL | ||
937 | #define PSET_MAGIC 0x6750e15f87337f91ULL | ||
938 | #define BSET_MAGIC 0x90135c78b99e07f5ULL | ||
939 | |||
940 | #define jset_magic(c) ((c)->sb.set_magic ^ JSET_MAGIC) | ||
941 | #define pset_magic(c) ((c)->sb.set_magic ^ PSET_MAGIC) | ||
942 | #define bset_magic(c) ((c)->sb.set_magic ^ BSET_MAGIC) | ||
943 | |||
944 | /* Bkey fields: all units are in sectors */ | ||
945 | |||
946 | #define KEY_FIELD(name, field, offset, size) \ | ||
947 | BITMASK(name, struct bkey, field, offset, size) | ||
948 | |||
949 | #define PTR_FIELD(name, offset, size) \ | ||
950 | static inline uint64_t name(const struct bkey *k, unsigned i) \ | ||
951 | { return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); } \ | ||
952 | \ | ||
953 | static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\ | ||
954 | { \ | ||
955 | k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset); \ | ||
956 | k->ptr[i] |= v << offset; \ | ||
957 | } | ||
958 | |||
959 | KEY_FIELD(KEY_PTRS, high, 60, 3) | ||
960 | KEY_FIELD(HEADER_SIZE, high, 58, 2) | ||
961 | KEY_FIELD(KEY_CSUM, high, 56, 2) | ||
962 | KEY_FIELD(KEY_PINNED, high, 55, 1) | ||
963 | KEY_FIELD(KEY_DIRTY, high, 36, 1) | ||
964 | |||
965 | KEY_FIELD(KEY_SIZE, high, 20, 16) | ||
966 | KEY_FIELD(KEY_INODE, high, 0, 20) | ||
967 | |||
968 | /* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */ | ||
969 | |||
970 | static inline uint64_t KEY_OFFSET(const struct bkey *k) | ||
971 | { | ||
972 | return k->low; | ||
973 | } | ||
974 | |||
975 | static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v) | ||
976 | { | ||
977 | k->low = v; | ||
978 | } | ||
979 | |||
980 | PTR_FIELD(PTR_DEV, 51, 12) | ||
981 | PTR_FIELD(PTR_OFFSET, 8, 43) | ||
982 | PTR_FIELD(PTR_GEN, 0, 8) | ||
983 | |||
984 | #define PTR_CHECK_DEV ((1 << 12) - 1) | ||
985 | |||
986 | #define PTR(gen, offset, dev) \ | ||
987 | ((((uint64_t) dev) << 51) | ((uint64_t) offset) << 8 | gen) | ||
988 | |||
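The PTR_FIELD()/PTR() definitions above move into <linux/bcache.h> (now included at the top of this header). As a worked example of the packing they describe, here is a standalone userspace model with arbitrarily chosen values.

/* Standalone worked example of the pointer packing defined above:
 * dev in bits 51..62, offset in bits 8..50, gen in bits 0..7. */
#include <stdint.h>
#include <stdio.h>

static uint64_t make_ptr(uint64_t gen, uint64_t offset, uint64_t dev)
{
	return (dev << 51) | (offset << 8) | gen;
}

static uint64_t ptr_dev(uint64_t p)    { return (p >> 51) & ((1ULL << 12) - 1); }
static uint64_t ptr_offset(uint64_t p) { return (p >> 8)  & ((1ULL << 43) - 1); }
static uint64_t ptr_gen(uint64_t p)    { return p & 0xff; }

int main(void)
{
	uint64_t p = make_ptr(3, 123456, 1);

	/* Prints: dev=1 offset=123456 gen=3 */
	printf("dev=%llu offset=%llu gen=%llu\n",
	       (unsigned long long)ptr_dev(p),
	       (unsigned long long)ptr_offset(p),
	       (unsigned long long)ptr_gen(p));
	return 0;
}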
989 | static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) | 751 | static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) |
990 | { | 752 | { |
991 | return s >> c->bucket_bits; | 753 | return s >> c->bucket_bits; |
@@ -1024,27 +786,11 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c, | |||
1024 | 786 | ||
1025 | /* Btree key macros */ | 787 | /* Btree key macros */ |
1026 | 788 | ||
1027 | /* | ||
1028 | * The high bit being set is a relic from when we used it to do binary | ||
1029 | * searches - it told you where a key started. It's not used anymore, | ||
1030 | * and can probably be safely dropped. | ||
1031 | */ | ||
1032 | #define KEY(dev, sector, len) \ | ||
1033 | ((struct bkey) { \ | ||
1034 | .high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev), \ | ||
1035 | .low = (sector) \ | ||
1036 | }) | ||
1037 | |||
1038 | static inline void bkey_init(struct bkey *k) | 789 | static inline void bkey_init(struct bkey *k) |
1039 | { | 790 | { |
1040 | *k = KEY(0, 0, 0); | 791 | *k = ZERO_KEY; |
1041 | } | 792 | } |
1042 | 793 | ||
1043 | #define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) | ||
1044 | #define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) | ||
1045 | #define MAX_KEY KEY(~(~0 << 20), ((uint64_t) ~0) >> 1, 0) | ||
1046 | #define ZERO_KEY KEY(0, 0, 0) | ||
1047 | |||
1048 | /* | 794 | /* |
1049 | * This is used for various on disk data structures - cache_sb, prio_set, bset, | 795 | * This is used for various on disk data structures - cache_sb, prio_set, bset, |
1050 | * jset: The checksum is _always_ the first 8 bytes of these structs | 796 | * jset: The checksum is _always_ the first 8 bytes of these structs |
@@ -1094,14 +840,6 @@ do { \ | |||
1094 | for (b = (ca)->buckets + (ca)->sb.first_bucket; \ | 840 | for (b = (ca)->buckets + (ca)->sb.first_bucket; \ |
1095 | b < (ca)->buckets + (ca)->sb.nbuckets; b++) | 841 | b < (ca)->buckets + (ca)->sb.nbuckets; b++) |
1096 | 842 | ||
1097 | static inline void __bkey_put(struct cache_set *c, struct bkey *k) | ||
1098 | { | ||
1099 | unsigned i; | ||
1100 | |||
1101 | for (i = 0; i < KEY_PTRS(k); i++) | ||
1102 | atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); | ||
1103 | } | ||
1104 | |||
1105 | static inline void cached_dev_put(struct cached_dev *dc) | 843 | static inline void cached_dev_put(struct cached_dev *dc) |
1106 | { | 844 | { |
1107 | if (atomic_dec_and_test(&dc->count)) | 845 | if (atomic_dec_and_test(&dc->count)) |
@@ -1173,13 +911,15 @@ uint8_t bch_inc_gen(struct cache *, struct bucket *); | |||
1173 | void bch_rescale_priorities(struct cache_set *, int); | 911 | void bch_rescale_priorities(struct cache_set *, int); |
1174 | bool bch_bucket_add_unused(struct cache *, struct bucket *); | 912 | bool bch_bucket_add_unused(struct cache *, struct bucket *); |
1175 | 913 | ||
1176 | long bch_bucket_alloc(struct cache *, unsigned, struct closure *); | 914 | long bch_bucket_alloc(struct cache *, unsigned, bool); |
1177 | void bch_bucket_free(struct cache_set *, struct bkey *); | 915 | void bch_bucket_free(struct cache_set *, struct bkey *); |
1178 | 916 | ||
1179 | int __bch_bucket_alloc_set(struct cache_set *, unsigned, | 917 | int __bch_bucket_alloc_set(struct cache_set *, unsigned, |
1180 | struct bkey *, int, struct closure *); | 918 | struct bkey *, int, bool); |
1181 | int bch_bucket_alloc_set(struct cache_set *, unsigned, | 919 | int bch_bucket_alloc_set(struct cache_set *, unsigned, |
1182 | struct bkey *, int, struct closure *); | 920 | struct bkey *, int, bool); |
921 | bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned, | ||
922 | unsigned, unsigned, bool); | ||
1183 | 923 | ||
1184 | __printf(2, 3) | 924 | __printf(2, 3) |
1185 | bool bch_cache_set_error(struct cache_set *, const char *, ...); | 925 | bool bch_cache_set_error(struct cache_set *, const char *, ...); |
@@ -1187,7 +927,7 @@ bool bch_cache_set_error(struct cache_set *, const char *, ...); | |||
1187 | void bch_prio_write(struct cache *); | 927 | void bch_prio_write(struct cache *); |
1188 | void bch_write_bdev_super(struct cached_dev *, struct closure *); | 928 | void bch_write_bdev_super(struct cached_dev *, struct closure *); |
1189 | 929 | ||
1190 | extern struct workqueue_struct *bcache_wq, *bch_gc_wq; | 930 | extern struct workqueue_struct *bcache_wq; |
1191 | extern const char * const bch_cache_modes[]; | 931 | extern const char * const bch_cache_modes[]; |
1192 | extern struct mutex bch_register_lock; | 932 | extern struct mutex bch_register_lock; |
1193 | extern struct list_head bch_cache_sets; | 933 | extern struct list_head bch_cache_sets; |
@@ -1220,15 +960,14 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *); | |||
1220 | void bch_btree_cache_free(struct cache_set *); | 960 | void bch_btree_cache_free(struct cache_set *); |
1221 | int bch_btree_cache_alloc(struct cache_set *); | 961 | int bch_btree_cache_alloc(struct cache_set *); |
1222 | void bch_moving_init_cache_set(struct cache_set *); | 962 | void bch_moving_init_cache_set(struct cache_set *); |
963 | int bch_open_buckets_alloc(struct cache_set *); | ||
964 | void bch_open_buckets_free(struct cache_set *); | ||
1223 | 965 | ||
1224 | int bch_cache_allocator_start(struct cache *ca); | 966 | int bch_cache_allocator_start(struct cache *ca); |
1225 | void bch_cache_allocator_exit(struct cache *ca); | ||
1226 | int bch_cache_allocator_init(struct cache *ca); | 967 | int bch_cache_allocator_init(struct cache *ca); |
1227 | 968 | ||
1228 | void bch_debug_exit(void); | 969 | void bch_debug_exit(void); |
1229 | int bch_debug_init(struct kobject *); | 970 | int bch_debug_init(struct kobject *); |
1230 | void bch_writeback_exit(void); | ||
1231 | int bch_writeback_init(void); | ||
1232 | void bch_request_exit(void); | 971 | void bch_request_exit(void); |
1233 | int bch_request_init(void); | 972 | int bch_request_init(void); |
1234 | void bch_btree_exit(void); | 973 | void bch_btree_exit(void); |
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 22d1ae72c282..7d388b8bb50e 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c | |||
@@ -14,22 +14,12 @@ | |||
14 | 14 | ||
15 | /* Keylists */ | 15 | /* Keylists */ |
16 | 16 | ||
17 | void bch_keylist_copy(struct keylist *dest, struct keylist *src) | ||
18 | { | ||
19 | *dest = *src; | ||
20 | |||
21 | if (src->list == src->d) { | ||
22 | size_t n = (uint64_t *) src->top - src->d; | ||
23 | dest->top = (struct bkey *) &dest->d[n]; | ||
24 | dest->list = dest->d; | ||
25 | } | ||
26 | } | ||
27 | |||
28 | int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) | 17 | int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) |
29 | { | 18 | { |
30 | unsigned oldsize = (uint64_t *) l->top - l->list; | 19 | size_t oldsize = bch_keylist_nkeys(l); |
31 | unsigned newsize = oldsize + 2 + nptrs; | 20 | size_t newsize = oldsize + 2 + nptrs; |
32 | uint64_t *new; | 21 | uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p; |
22 | uint64_t *new_keys; | ||
33 | 23 | ||
34 | /* The journalling code doesn't handle the case where the keys to insert | 24 | /* The journalling code doesn't handle the case where the keys to insert |
35 | * is bigger than an empty write: If we just return -ENOMEM here, | 25 | * is bigger than an empty write: If we just return -ENOMEM here, |
@@ -45,24 +35,23 @@ int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) | |||
45 | roundup_pow_of_two(oldsize) == newsize) | 35 | roundup_pow_of_two(oldsize) == newsize) |
46 | return 0; | 36 | return 0; |
47 | 37 | ||
48 | new = krealloc(l->list == l->d ? NULL : l->list, | 38 | new_keys = krealloc(old_keys, sizeof(uint64_t) * newsize, GFP_NOIO); |
49 | sizeof(uint64_t) * newsize, GFP_NOIO); | ||
50 | 39 | ||
51 | if (!new) | 40 | if (!new_keys) |
52 | return -ENOMEM; | 41 | return -ENOMEM; |
53 | 42 | ||
54 | if (l->list == l->d) | 43 | if (!old_keys) |
55 | memcpy(new, l->list, sizeof(uint64_t) * KEYLIST_INLINE); | 44 | memcpy(new_keys, l->inline_keys, sizeof(uint64_t) * oldsize); |
56 | 45 | ||
57 | l->list = new; | 46 | l->keys_p = new_keys; |
58 | l->top = (struct bkey *) (&l->list[oldsize]); | 47 | l->top_p = new_keys + oldsize; |
59 | 48 | ||
60 | return 0; | 49 | return 0; |
61 | } | 50 | } |
62 | 51 | ||
63 | struct bkey *bch_keylist_pop(struct keylist *l) | 52 | struct bkey *bch_keylist_pop(struct keylist *l) |
64 | { | 53 | { |
65 | struct bkey *k = l->bottom; | 54 | struct bkey *k = l->keys; |
66 | 55 | ||
67 | if (k == l->top) | 56 | if (k == l->top) |
68 | return NULL; | 57 | return NULL; |
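The reworked bch_keylist_realloc() above grows the key array with krealloc(), passing NULL on the first spill so the inline buffer is never handed to the allocator, and copying its contents across by hand. A minimal userspace sketch of the same inline-then-heap growth pattern (all names here are illustrative, not the bcache ones):

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    #define INLINE_U64S 16

    struct klist {
        uint64_t *keys;                  /* current storage */
        size_t    nkeys;                 /* u64s in use */
        uint64_t  inline_keys[INLINE_U64S];
    };

    /* Make room for nkeys + extra u64s; 0 on success, -1 on OOM. */
    static int klist_realloc(struct klist *l, size_t extra)
    {
        size_t newsize = l->nkeys + extra;
        uint64_t *old = (l->keys == l->inline_keys) ? NULL : l->keys;
        uint64_t *new_keys;

        if (!old && newsize <= INLINE_U64S)
            return 0;                    /* still fits in the inline buffer */

        new_keys = realloc(old, newsize * sizeof(uint64_t));
        if (!new_keys)
            return -1;

        if (!old)                        /* first heap allocation: copy inline part */
            memcpy(new_keys, l->inline_keys, l->nkeys * sizeof(uint64_t));

        l->keys = new_keys;
        return 0;
    }

    int main(void)
    {
        struct klist l;

        l.keys  = l.inline_keys;
        l.nkeys = 0;
        return klist_realloc(&l, 32) ? 1 : 0;    /* forces the heap spill */
    }

A matching free helper would only free when the pointer no longer points at the inline buffer, which is exactly what bch_keylist_free() does further down in bset.h.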
@@ -73,21 +62,20 @@ struct bkey *bch_keylist_pop(struct keylist *l) | |||
73 | return l->top = k; | 62 | return l->top = k; |
74 | } | 63 | } |
75 | 64 | ||
76 | /* Pointer validation */ | 65 | void bch_keylist_pop_front(struct keylist *l) |
77 | |||
78 | bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) | ||
79 | { | 66 | { |
80 | unsigned i; | 67 | l->top_p -= bkey_u64s(l->keys); |
81 | char buf[80]; | ||
82 | 68 | ||
83 | if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))) | 69 | memmove(l->keys, |
84 | goto bad; | 70 | bkey_next(l->keys), |
71 | bch_keylist_bytes(l)); | ||
72 | } | ||
85 | 73 | ||
86 | if (!level && KEY_SIZE(k) > KEY_OFFSET(k)) | 74 | /* Pointer validation */ |
87 | goto bad; | ||
88 | 75 | ||
89 | if (!KEY_SIZE(k)) | 76 | static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) |
90 | return true; | 77 | { |
78 | unsigned i; | ||
91 | 79 | ||
92 | for (i = 0; i < KEY_PTRS(k); i++) | 80 | for (i = 0; i < KEY_PTRS(k); i++) |
93 | if (ptr_available(c, k, i)) { | 81 | if (ptr_available(c, k, i)) { |
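The new bch_keylist_pop_front() above drops the first key by shrinking top_p by that key's length in u64s and memmove()ing everything that follows down to the start of the buffer; because keys are variable-length records, the record itself says how far to shift. A rough standalone illustration, with the record layout invented for the sketch (first u64 = total length in u64s):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* buf holds back-to-back records; rec[0] is that record's length in u64s. */
    static void pop_front(uint64_t *buf, size_t *used)
    {
        size_t first = buf[0];           /* u64s taken by the first record */

        *used -= first;
        memmove(buf, buf + first, *used * sizeof(uint64_t));
    }

    int main(void)
    {
        uint64_t buf[8] = { 3, 11, 12,   /* record A: 3 u64s */
                            2, 21 };     /* record B: 2 u64s */
        size_t used = 5;

        pop_front(buf, &used);
        printf("%zu u64s left, first len %llu\n",
               used, (unsigned long long)buf[0]);   /* 2 u64s left, first len 2 */
        return 0;
    }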
@@ -98,13 +86,83 @@ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) | |||
98 | if (KEY_SIZE(k) + r > c->sb.bucket_size || | 86 | if (KEY_SIZE(k) + r > c->sb.bucket_size || |
99 | bucket < ca->sb.first_bucket || | 87 | bucket < ca->sb.first_bucket || |
100 | bucket >= ca->sb.nbuckets) | 88 | bucket >= ca->sb.nbuckets) |
101 | goto bad; | 89 | return true; |
102 | } | 90 | } |
103 | 91 | ||
104 | return false; | 92 | return false; |
93 | } | ||
94 | |||
95 | bool bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) | ||
96 | { | ||
97 | char buf[80]; | ||
98 | |||
99 | if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)) | ||
100 | goto bad; | ||
101 | |||
102 | if (__ptr_invalid(c, k)) | ||
103 | goto bad; | ||
104 | |||
105 | return false; | ||
106 | bad: | ||
107 | bch_bkey_to_text(buf, sizeof(buf), k); | ||
108 | cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k)); | ||
109 | return true; | ||
110 | } | ||
111 | |||
112 | bool bch_extent_ptr_invalid(struct cache_set *c, const struct bkey *k) | ||
113 | { | ||
114 | char buf[80]; | ||
115 | |||
116 | if (!KEY_SIZE(k)) | ||
117 | return true; | ||
118 | |||
119 | if (KEY_SIZE(k) > KEY_OFFSET(k)) | ||
120 | goto bad; | ||
121 | |||
122 | if (__ptr_invalid(c, k)) | ||
123 | goto bad; | ||
124 | |||
125 | return false; | ||
105 | bad: | 126 | bad: |
106 | bch_bkey_to_text(buf, sizeof(buf), k); | 127 | bch_bkey_to_text(buf, sizeof(buf), k); |
107 | cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k)); | 128 | cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k)); |
129 | return true; | ||
130 | } | ||
131 | |||
132 | static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k, | ||
133 | unsigned ptr) | ||
134 | { | ||
135 | struct bucket *g = PTR_BUCKET(b->c, k, ptr); | ||
136 | char buf[80]; | ||
137 | |||
138 | if (mutex_trylock(&b->c->bucket_lock)) { | ||
139 | if (b->level) { | ||
140 | if (KEY_DIRTY(k) || | ||
141 | g->prio != BTREE_PRIO || | ||
142 | (b->c->gc_mark_valid && | ||
143 | GC_MARK(g) != GC_MARK_METADATA)) | ||
144 | goto err; | ||
145 | |||
146 | } else { | ||
147 | if (g->prio == BTREE_PRIO) | ||
148 | goto err; | ||
149 | |||
150 | if (KEY_DIRTY(k) && | ||
151 | b->c->gc_mark_valid && | ||
152 | GC_MARK(g) != GC_MARK_DIRTY) | ||
153 | goto err; | ||
154 | } | ||
155 | mutex_unlock(&b->c->bucket_lock); | ||
156 | } | ||
157 | |||
158 | return false; | ||
159 | err: | ||
160 | mutex_unlock(&b->c->bucket_lock); | ||
161 | bch_bkey_to_text(buf, sizeof(buf), k); | ||
162 | btree_bug(b, | ||
163 | "inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", | ||
164 | buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), | ||
165 | g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); | ||
108 | return true; | 166 | return true; |
109 | } | 167 | } |
110 | 168 | ||
@@ -118,64 +176,29 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k) | |||
118 | bch_ptr_invalid(b, k)) | 176 | bch_ptr_invalid(b, k)) |
119 | return true; | 177 | return true; |
120 | 178 | ||
121 | if (KEY_PTRS(k) && PTR_DEV(k, 0) == PTR_CHECK_DEV) | 179 | for (i = 0; i < KEY_PTRS(k); i++) { |
122 | return true; | 180 | if (!ptr_available(b->c, k, i)) |
181 | return true; | ||
123 | 182 | ||
124 | for (i = 0; i < KEY_PTRS(k); i++) | 183 | g = PTR_BUCKET(b->c, k, i); |
125 | if (ptr_available(b->c, k, i)) { | 184 | stale = ptr_stale(b->c, k, i); |
126 | g = PTR_BUCKET(b->c, k, i); | ||
127 | stale = ptr_stale(b->c, k, i); | ||
128 | 185 | ||
129 | btree_bug_on(stale > 96, b, | 186 | btree_bug_on(stale > 96, b, |
130 | "key too stale: %i, need_gc %u", | 187 | "key too stale: %i, need_gc %u", |
131 | stale, b->c->need_gc); | 188 | stale, b->c->need_gc); |
132 | 189 | ||
133 | btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), | 190 | btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), |
134 | b, "stale dirty pointer"); | 191 | b, "stale dirty pointer"); |
135 | 192 | ||
136 | if (stale) | 193 | if (stale) |
137 | return true; | 194 | return true; |
138 | 195 | ||
139 | #ifdef CONFIG_BCACHE_EDEBUG | 196 | if (expensive_debug_checks(b->c) && |
140 | if (!mutex_trylock(&b->c->bucket_lock)) | 197 | ptr_bad_expensive_checks(b, k, i)) |
141 | continue; | 198 | return true; |
142 | 199 | } | |
143 | if (b->level) { | ||
144 | if (KEY_DIRTY(k) || | ||
145 | g->prio != BTREE_PRIO || | ||
146 | (b->c->gc_mark_valid && | ||
147 | GC_MARK(g) != GC_MARK_METADATA)) | ||
148 | goto bug; | ||
149 | |||
150 | } else { | ||
151 | if (g->prio == BTREE_PRIO) | ||
152 | goto bug; | ||
153 | |||
154 | if (KEY_DIRTY(k) && | ||
155 | b->c->gc_mark_valid && | ||
156 | GC_MARK(g) != GC_MARK_DIRTY) | ||
157 | goto bug; | ||
158 | } | ||
159 | mutex_unlock(&b->c->bucket_lock); | ||
160 | #endif | ||
161 | } | ||
162 | 200 | ||
163 | return false; | 201 | return false; |
164 | #ifdef CONFIG_BCACHE_EDEBUG | ||
165 | bug: | ||
166 | mutex_unlock(&b->c->bucket_lock); | ||
167 | |||
168 | { | ||
169 | char buf[80]; | ||
170 | |||
171 | bch_bkey_to_text(buf, sizeof(buf), k); | ||
172 | btree_bug(b, | ||
173 | "inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", | ||
174 | buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), | ||
175 | g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); | ||
176 | } | ||
177 | return true; | ||
178 | #endif | ||
179 | } | 202 | } |
180 | 203 | ||
181 | /* Key/pointer manipulation */ | 204 | /* Key/pointer manipulation */ |
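Throughout this hunk the compile-time CONFIG_BCACHE_EDEBUG blocks are replaced by a runtime expensive_debug_checks() test, with the heavy bucket validation moved out into ptr_bad_expensive_checks() so the common path stays small. The shape of that change, reduced to a toy (the flag and function names below are made up for the sketch):

    #include <stdbool.h>
    #include <stdio.h>

    static bool expensive_checks;            /* stands in for a per-cache-set debug knob */

    /* Slow consistency check, only worth running while debugging. */
    static bool value_bad_expensive(int v)
    {
        return v < 0 || v > 1000;
    }

    static bool value_bad(int v)
    {
        if (v == 0)                          /* cheap check, always compiled in */
            return true;

        if (expensive_checks && value_bad_expensive(v))
            return true;

        return false;
    }

    int main(void)
    {
        expensive_checks = true;             /* flipped at runtime, no rebuild */
        printf("%d\n", value_bad(2000));     /* caught only with the knob on */
        return 0;
    }

The upside over the old #ifdef is that a single build can toggle the checks per cache set; the cost is one predictable branch on the fast path.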
@@ -458,16 +481,8 @@ static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline) | |||
458 | 481 | ||
459 | static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift) | 482 | static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift) |
460 | { | 483 | { |
461 | #ifdef CONFIG_X86_64 | ||
462 | asm("shrd %[shift],%[high],%[low]" | ||
463 | : [low] "+Rm" (low) | ||
464 | : [high] "R" (high), | ||
465 | [shift] "ci" (shift) | ||
466 | : "cc"); | ||
467 | #else | ||
468 | low >>= shift; | 484 | low >>= shift; |
469 | low |= (high << 1) << (63U - shift); | 485 | low |= (high << 1) << (63U - shift); |
470 | #endif | ||
471 | return low; | 486 | return low; |
472 | } | 487 | } |
473 | 488 | ||
@@ -686,7 +701,7 @@ void bch_bset_init_next(struct btree *b) | |||
686 | } else | 701 | } else |
687 | get_random_bytes(&i->seq, sizeof(uint64_t)); | 702 | get_random_bytes(&i->seq, sizeof(uint64_t)); |
688 | 703 | ||
689 | i->magic = bset_magic(b->c); | 704 | i->magic = bset_magic(&b->c->sb); |
690 | i->version = 0; | 705 | i->version = 0; |
691 | i->keys = 0; | 706 | i->keys = 0; |
692 | 707 | ||
@@ -824,16 +839,16 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, | |||
824 | } else | 839 | } else |
825 | i = bset_search_write_set(b, t, search); | 840 | i = bset_search_write_set(b, t, search); |
826 | 841 | ||
827 | #ifdef CONFIG_BCACHE_EDEBUG | 842 | if (expensive_debug_checks(b->c)) { |
828 | BUG_ON(bset_written(b, t) && | 843 | BUG_ON(bset_written(b, t) && |
829 | i.l != t->data->start && | 844 | i.l != t->data->start && |
830 | bkey_cmp(tree_to_prev_bkey(t, | 845 | bkey_cmp(tree_to_prev_bkey(t, |
831 | inorder_to_tree(bkey_to_cacheline(t, i.l), t)), | 846 | inorder_to_tree(bkey_to_cacheline(t, i.l), t)), |
832 | search) > 0); | 847 | search) > 0); |
833 | 848 | ||
834 | BUG_ON(i.r != end(t->data) && | 849 | BUG_ON(i.r != end(t->data) && |
835 | bkey_cmp(i.r, search) <= 0); | 850 | bkey_cmp(i.r, search) <= 0); |
836 | #endif | 851 | } |
837 | 852 | ||
838 | while (likely(i.l != i.r) && | 853 | while (likely(i.l != i.r) && |
839 | bkey_cmp(i.l, search) <= 0) | 854 | bkey_cmp(i.l, search) <= 0) |
@@ -844,6 +859,13 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, | |||
844 | 859 | ||
845 | /* Btree iterator */ | 860 | /* Btree iterator */ |
846 | 861 | ||
862 | /* | ||
863 | * Returns true if l > r - unless l == r, in which case returns true if l is | ||
864 | * older than r. | ||
865 | * | ||
866 | * Necessary for btree_sort_fixup() - if there are multiple keys that compare | ||
867 | * equal in different sets, we have to process them newest to oldest. | ||
868 | */ | ||
847 | static inline bool btree_iter_cmp(struct btree_iter_set l, | 869 | static inline bool btree_iter_cmp(struct btree_iter_set l, |
848 | struct btree_iter_set r) | 870 | struct btree_iter_set r) |
849 | { | 871 | { |
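The comment added above documents the ordering used while merging sets: compare keys, and when two keys compare equal, treat the older set's copy as the greater of the two, so the merge sees duplicates newest to oldest. A simplified comparator over an invented (key, age) pair, only meant to show the tiebreak direction:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct iter_set {
        uint64_t key;                    /* sort key */
        unsigned age;                    /* higher == older set */
    };

    /* "l sorts after r": greater key wins; equal keys fall back to age. */
    static bool iter_cmp(struct iter_set l, struct iter_set r)
    {
        if (l.key != r.key)
            return l.key > r.key;

        return l.age > r.age;            /* the older copy compares as greater */
    }

    int main(void)
    {
        struct iter_set newer = { 42, 0 }, older = { 42, 3 };

        printf("%d %d\n", iter_cmp(older, newer), iter_cmp(newer, older));
        /* prints "1 0": the older copy of key 42 is ranked after the newer one */
        return 0;
    }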
@@ -867,12 +889,16 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, | |||
867 | } | 889 | } |
868 | 890 | ||
869 | struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, | 891 | struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, |
870 | struct bkey *search, struct bset_tree *start) | 892 | struct bkey *search, struct bset_tree *start) |
871 | { | 893 | { |
872 | struct bkey *ret = NULL; | 894 | struct bkey *ret = NULL; |
873 | iter->size = ARRAY_SIZE(iter->data); | 895 | iter->size = ARRAY_SIZE(iter->data); |
874 | iter->used = 0; | 896 | iter->used = 0; |
875 | 897 | ||
898 | #ifdef CONFIG_BCACHE_DEBUG | ||
899 | iter->b = b; | ||
900 | #endif | ||
901 | |||
876 | for (; start <= &b->sets[b->nsets]; start++) { | 902 | for (; start <= &b->sets[b->nsets]; start++) { |
877 | ret = bch_bset_search(b, start, search); | 903 | ret = bch_bset_search(b, start, search); |
878 | bch_btree_iter_push(iter, ret, end(start->data)); | 904 | bch_btree_iter_push(iter, ret, end(start->data)); |
@@ -887,6 +913,8 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter) | |||
887 | struct bkey *ret = NULL; | 913 | struct bkey *ret = NULL; |
888 | 914 | ||
889 | if (!btree_iter_end(iter)) { | 915 | if (!btree_iter_end(iter)) { |
916 | bch_btree_iter_next_check(iter); | ||
917 | |||
890 | ret = iter->data->k; | 918 | ret = iter->data->k; |
891 | iter->data->k = bkey_next(iter->data->k); | 919 | iter->data->k = bkey_next(iter->data->k); |
892 | 920 | ||
@@ -916,14 +944,6 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, | |||
916 | return ret; | 944 | return ret; |
917 | } | 945 | } |
918 | 946 | ||
919 | struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search) | ||
920 | { | ||
921 | struct btree_iter iter; | ||
922 | |||
923 | bch_btree_iter_init(b, &iter, search); | ||
924 | return bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); | ||
925 | } | ||
926 | |||
927 | /* Mergesort */ | 947 | /* Mergesort */ |
928 | 948 | ||
929 | static void sort_key_next(struct btree_iter *iter, | 949 | static void sort_key_next(struct btree_iter *iter, |
@@ -998,7 +1018,6 @@ static void btree_mergesort(struct btree *b, struct bset *out, | |||
998 | out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0; | 1018 | out->keys = last ? (uint64_t *) bkey_next(last) - out->d : 0; |
999 | 1019 | ||
1000 | pr_debug("sorted %i keys", out->keys); | 1020 | pr_debug("sorted %i keys", out->keys); |
1001 | bch_check_key_order(b, out); | ||
1002 | } | 1021 | } |
1003 | 1022 | ||
1004 | static void __btree_sort(struct btree *b, struct btree_iter *iter, | 1023 | static void __btree_sort(struct btree *b, struct btree_iter *iter, |
@@ -1029,7 +1048,7 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, | |||
1029 | * memcpy() | 1048 | * memcpy() |
1030 | */ | 1049 | */ |
1031 | 1050 | ||
1032 | out->magic = bset_magic(b->c); | 1051 | out->magic = bset_magic(&b->c->sb); |
1033 | out->seq = b->sets[0].data->seq; | 1052 | out->seq = b->sets[0].data->seq; |
1034 | out->version = b->sets[0].data->version; | 1053 | out->version = b->sets[0].data->version; |
1035 | swap(out, b->sets[0].data); | 1054 | swap(out, b->sets[0].data); |
@@ -1050,24 +1069,21 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, | |||
1050 | if (b->written) | 1069 | if (b->written) |
1051 | bset_build_written_tree(b); | 1070 | bset_build_written_tree(b); |
1052 | 1071 | ||
1053 | if (!start) { | 1072 | if (!start) |
1054 | spin_lock(&b->c->sort_time_lock); | ||
1055 | bch_time_stats_update(&b->c->sort_time, start_time); | 1073 | bch_time_stats_update(&b->c->sort_time, start_time); |
1056 | spin_unlock(&b->c->sort_time_lock); | ||
1057 | } | ||
1058 | } | 1074 | } |
1059 | 1075 | ||
1060 | void bch_btree_sort_partial(struct btree *b, unsigned start) | 1076 | void bch_btree_sort_partial(struct btree *b, unsigned start) |
1061 | { | 1077 | { |
1062 | size_t oldsize = 0, order = b->page_order, keys = 0; | 1078 | size_t order = b->page_order, keys = 0; |
1063 | struct btree_iter iter; | 1079 | struct btree_iter iter; |
1080 | int oldsize = bch_count_data(b); | ||
1081 | |||
1064 | __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); | 1082 | __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); |
1065 | 1083 | ||
1066 | BUG_ON(b->sets[b->nsets].data == write_block(b) && | 1084 | BUG_ON(b->sets[b->nsets].data == write_block(b) && |
1067 | (b->sets[b->nsets].size || b->nsets)); | 1085 | (b->sets[b->nsets].size || b->nsets)); |
1068 | 1086 | ||
1069 | if (b->written) | ||
1070 | oldsize = bch_count_data(b); | ||
1071 | 1087 | ||
1072 | if (start) { | 1088 | if (start) { |
1073 | unsigned i; | 1089 | unsigned i; |
@@ -1083,7 +1099,7 @@ void bch_btree_sort_partial(struct btree *b, unsigned start) | |||
1083 | 1099 | ||
1084 | __btree_sort(b, &iter, start, order, false); | 1100 | __btree_sort(b, &iter, start, order, false); |
1085 | 1101 | ||
1086 | EBUG_ON(b->written && bch_count_data(b) != oldsize); | 1102 | EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize); |
1087 | } | 1103 | } |
1088 | 1104 | ||
1089 | void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) | 1105 | void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) |
@@ -1101,9 +1117,7 @@ void bch_btree_sort_into(struct btree *b, struct btree *new) | |||
1101 | 1117 | ||
1102 | btree_mergesort(b, new->sets->data, &iter, false, true); | 1118 | btree_mergesort(b, new->sets->data, &iter, false, true); |
1103 | 1119 | ||
1104 | spin_lock(&b->c->sort_time_lock); | ||
1105 | bch_time_stats_update(&b->c->sort_time, start_time); | 1120 | bch_time_stats_update(&b->c->sort_time, start_time); |
1106 | spin_unlock(&b->c->sort_time_lock); | ||
1107 | 1121 | ||
1108 | bkey_copy_key(&new->key, &b->key); | 1122 | bkey_copy_key(&new->key, &b->key); |
1109 | new->sets->size = 0; | 1123 | new->sets->size = 0; |
@@ -1148,16 +1162,16 @@ out: | |||
1148 | /* Sysfs stuff */ | 1162 | /* Sysfs stuff */ |
1149 | 1163 | ||
1150 | struct bset_stats { | 1164 | struct bset_stats { |
1165 | struct btree_op op; | ||
1151 | size_t nodes; | 1166 | size_t nodes; |
1152 | size_t sets_written, sets_unwritten; | 1167 | size_t sets_written, sets_unwritten; |
1153 | size_t bytes_written, bytes_unwritten; | 1168 | size_t bytes_written, bytes_unwritten; |
1154 | size_t floats, failed; | 1169 | size_t floats, failed; |
1155 | }; | 1170 | }; |
1156 | 1171 | ||
1157 | static int bch_btree_bset_stats(struct btree *b, struct btree_op *op, | 1172 | static int btree_bset_stats(struct btree_op *op, struct btree *b) |
1158 | struct bset_stats *stats) | ||
1159 | { | 1173 | { |
1160 | struct bkey *k; | 1174 | struct bset_stats *stats = container_of(op, struct bset_stats, op); |
1161 | unsigned i; | 1175 | unsigned i; |
1162 | 1176 | ||
1163 | stats->nodes++; | 1177 | stats->nodes++; |
@@ -1182,30 +1196,19 @@ static int bch_btree_bset_stats(struct btree *b, struct btree_op *op, | |||
1182 | } | 1196 | } |
1183 | } | 1197 | } |
1184 | 1198 | ||
1185 | if (b->level) { | 1199 | return MAP_CONTINUE; |
1186 | struct btree_iter iter; | ||
1187 | |||
1188 | for_each_key_filter(b, k, &iter, bch_ptr_bad) { | ||
1189 | int ret = btree(bset_stats, k, b, op, stats); | ||
1190 | if (ret) | ||
1191 | return ret; | ||
1192 | } | ||
1193 | } | ||
1194 | |||
1195 | return 0; | ||
1196 | } | 1200 | } |
1197 | 1201 | ||
1198 | int bch_bset_print_stats(struct cache_set *c, char *buf) | 1202 | int bch_bset_print_stats(struct cache_set *c, char *buf) |
1199 | { | 1203 | { |
1200 | struct btree_op op; | ||
1201 | struct bset_stats t; | 1204 | struct bset_stats t; |
1202 | int ret; | 1205 | int ret; |
1203 | 1206 | ||
1204 | bch_btree_op_init_stack(&op); | ||
1205 | memset(&t, 0, sizeof(struct bset_stats)); | 1207 | memset(&t, 0, sizeof(struct bset_stats)); |
1208 | bch_btree_op_init(&t.op, -1); | ||
1206 | 1209 | ||
1207 | ret = btree_root(bset_stats, c, &op, &t); | 1210 | ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats); |
1208 | if (ret) | 1211 | if (ret < 0) |
1209 | return ret; | 1212 | return ret; |
1210 | 1213 | ||
1211 | return snprintf(buf, PAGE_SIZE, | 1214 | return snprintf(buf, PAGE_SIZE, |
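bch_bset_print_stats() above now drives the generic bch_btree_map_nodes() walker instead of recursing by hand: the traversal state embeds a struct btree_op as its first member and the btree_bset_stats() callback recovers the outer struct with container_of(). The embedding trick in a self-contained form (the walker, struct, and field names here are invented for the sketch):

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct walk_op {                         /* stands in for struct btree_op */
        int lock;
    };

    struct stats {
        struct walk_op op;                   /* embedded so container_of() works */
        size_t nodes;
    };

    /* Generic walker: it only ever sees the embedded walk_op. */
    static void map_nodes(struct walk_op *op, int nr, void (*fn)(struct walk_op *))
    {
        while (nr--)
            fn(op);
    }

    static void count_node(struct walk_op *op)
    {
        struct stats *s = container_of(op, struct stats, op);

        s->nodes++;                          /* callback gets its private state back */
    }

    int main(void)
    {
        struct stats s = { .op = { .lock = -1 }, .nodes = 0 };

        map_nodes(&s.op, 3, count_node);
        printf("visited %zu nodes\n", s.nodes);
        return 0;
    }

The walker's signature stays generic while each caller carries whatever extra state it needs, which is why the old btree_op/bset_stats pair of arguments can collapse into one.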
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index ae115a253d73..1d3c24f9fa0e 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h | |||
@@ -148,6 +148,9 @@ | |||
148 | 148 | ||
149 | struct btree_iter { | 149 | struct btree_iter { |
150 | size_t size, used; | 150 | size_t size, used; |
151 | #ifdef CONFIG_BCACHE_DEBUG | ||
152 | struct btree *b; | ||
153 | #endif | ||
151 | struct btree_iter_set { | 154 | struct btree_iter_set { |
152 | struct bkey *k, *end; | 155 | struct bkey *k, *end; |
153 | } data[MAX_BSETS]; | 156 | } data[MAX_BSETS]; |
@@ -193,54 +196,26 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l, | |||
193 | : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); | 196 | : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); |
194 | } | 197 | } |
195 | 198 | ||
196 | static inline size_t bkey_u64s(const struct bkey *k) | ||
197 | { | ||
198 | BUG_ON(KEY_CSUM(k) > 1); | ||
199 | return 2 + KEY_PTRS(k) + (KEY_CSUM(k) ? 1 : 0); | ||
200 | } | ||
201 | |||
202 | static inline size_t bkey_bytes(const struct bkey *k) | ||
203 | { | ||
204 | return bkey_u64s(k) * sizeof(uint64_t); | ||
205 | } | ||
206 | |||
207 | static inline void bkey_copy(struct bkey *dest, const struct bkey *src) | ||
208 | { | ||
209 | memcpy(dest, src, bkey_bytes(src)); | ||
210 | } | ||
211 | |||
212 | static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src) | ||
213 | { | ||
214 | if (!src) | ||
215 | src = &KEY(0, 0, 0); | ||
216 | |||
217 | SET_KEY_INODE(dest, KEY_INODE(src)); | ||
218 | SET_KEY_OFFSET(dest, KEY_OFFSET(src)); | ||
219 | } | ||
220 | |||
221 | static inline struct bkey *bkey_next(const struct bkey *k) | ||
222 | { | ||
223 | uint64_t *d = (void *) k; | ||
224 | return (struct bkey *) (d + bkey_u64s(k)); | ||
225 | } | ||
226 | |||
227 | /* Keylists */ | 199 | /* Keylists */ |
228 | 200 | ||
229 | struct keylist { | 201 | struct keylist { |
230 | struct bkey *top; | ||
231 | union { | 202 | union { |
232 | uint64_t *list; | 203 | struct bkey *keys; |
233 | struct bkey *bottom; | 204 | uint64_t *keys_p; |
205 | }; | ||
206 | union { | ||
207 | struct bkey *top; | ||
208 | uint64_t *top_p; | ||
234 | }; | 209 | }; |
235 | 210 | ||
236 | /* Enough room for btree_split's keys without realloc */ | 211 | /* Enough room for btree_split's keys without realloc */ |
237 | #define KEYLIST_INLINE 16 | 212 | #define KEYLIST_INLINE 16 |
238 | uint64_t d[KEYLIST_INLINE]; | 213 | uint64_t inline_keys[KEYLIST_INLINE]; |
239 | }; | 214 | }; |
240 | 215 | ||
241 | static inline void bch_keylist_init(struct keylist *l) | 216 | static inline void bch_keylist_init(struct keylist *l) |
242 | { | 217 | { |
243 | l->top = (void *) (l->list = l->d); | 218 | l->top_p = l->keys_p = l->inline_keys; |
244 | } | 219 | } |
245 | 220 | ||
246 | static inline void bch_keylist_push(struct keylist *l) | 221 | static inline void bch_keylist_push(struct keylist *l) |
@@ -256,17 +231,32 @@ static inline void bch_keylist_add(struct keylist *l, struct bkey *k) | |||
256 | 231 | ||
257 | static inline bool bch_keylist_empty(struct keylist *l) | 232 | static inline bool bch_keylist_empty(struct keylist *l) |
258 | { | 233 | { |
259 | return l->top == (void *) l->list; | 234 | return l->top == l->keys; |
235 | } | ||
236 | |||
237 | static inline void bch_keylist_reset(struct keylist *l) | ||
238 | { | ||
239 | l->top = l->keys; | ||
260 | } | 240 | } |
261 | 241 | ||
262 | static inline void bch_keylist_free(struct keylist *l) | 242 | static inline void bch_keylist_free(struct keylist *l) |
263 | { | 243 | { |
264 | if (l->list != l->d) | 244 | if (l->keys_p != l->inline_keys) |
265 | kfree(l->list); | 245 | kfree(l->keys_p); |
246 | } | ||
247 | |||
248 | static inline size_t bch_keylist_nkeys(struct keylist *l) | ||
249 | { | ||
250 | return l->top_p - l->keys_p; | ||
251 | } | ||
252 | |||
253 | static inline size_t bch_keylist_bytes(struct keylist *l) | ||
254 | { | ||
255 | return bch_keylist_nkeys(l) * sizeof(uint64_t); | ||
266 | } | 256 | } |
267 | 257 | ||
268 | void bch_keylist_copy(struct keylist *, struct keylist *); | ||
269 | struct bkey *bch_keylist_pop(struct keylist *); | 258 | struct bkey *bch_keylist_pop(struct keylist *); |
259 | void bch_keylist_pop_front(struct keylist *); | ||
270 | int bch_keylist_realloc(struct keylist *, int, struct cache_set *); | 260 | int bch_keylist_realloc(struct keylist *, int, struct cache_set *); |
271 | 261 | ||
272 | void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, | 262 | void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, |
@@ -287,7 +277,9 @@ static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) | |||
287 | } | 277 | } |
288 | 278 | ||
289 | const char *bch_ptr_status(struct cache_set *, const struct bkey *); | 279 | const char *bch_ptr_status(struct cache_set *, const struct bkey *); |
290 | bool __bch_ptr_invalid(struct cache_set *, int level, const struct bkey *); | 280 | bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); |
281 | bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *); | ||
282 | |||
291 | bool bch_ptr_bad(struct btree *, const struct bkey *); | 283 | bool bch_ptr_bad(struct btree *, const struct bkey *); |
292 | 284 | ||
293 | static inline uint8_t gen_after(uint8_t a, uint8_t b) | 285 | static inline uint8_t gen_after(uint8_t a, uint8_t b) |
@@ -311,7 +303,6 @@ static inline bool ptr_available(struct cache_set *c, const struct bkey *k, | |||
311 | 303 | ||
312 | typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); | 304 | typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); |
313 | 305 | ||
314 | struct bkey *bch_next_recurse_key(struct btree *, struct bkey *); | ||
315 | struct bkey *bch_btree_iter_next(struct btree_iter *); | 306 | struct bkey *bch_btree_iter_next(struct btree_iter *); |
316 | struct bkey *bch_btree_iter_next_filter(struct btree_iter *, | 307 | struct bkey *bch_btree_iter_next_filter(struct btree_iter *, |
317 | struct btree *, ptr_filter_fn); | 308 | struct btree *, ptr_filter_fn); |
@@ -361,12 +352,30 @@ void bch_bset_fix_lookup_table(struct btree *, struct bkey *); | |||
361 | struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, | 352 | struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, |
362 | const struct bkey *); | 353 | const struct bkey *); |
363 | 354 | ||
355 | /* | ||
356 | * Returns the first key that is strictly greater than search | ||
357 | */ | ||
364 | static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t, | 358 | static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t, |
365 | const struct bkey *search) | 359 | const struct bkey *search) |
366 | { | 360 | { |
367 | return search ? __bch_bset_search(b, t, search) : t->data->start; | 361 | return search ? __bch_bset_search(b, t, search) : t->data->start; |
368 | } | 362 | } |
369 | 363 | ||
364 | #define PRECEDING_KEY(_k) \ | ||
365 | ({ \ | ||
366 | struct bkey *_ret = NULL; \ | ||
367 | \ | ||
368 | if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \ | ||
369 | _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \ | ||
370 | \ | ||
371 | if (!_ret->low) \ | ||
372 | _ret->high--; \ | ||
373 | _ret->low--; \ | ||
374 | } \ | ||
375 | \ | ||
376 | _ret; \ | ||
377 | }) | ||
378 | |||
370 | bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *); | 379 | bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *); |
371 | void bch_btree_sort_lazy(struct btree *); | 380 | void bch_btree_sort_lazy(struct btree *); |
372 | void bch_btree_sort_into(struct btree *, struct btree *); | 381 | void bch_btree_sort_into(struct btree *, struct btree *); |
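The PRECEDING_KEY() macro above constructs the key that sorts immediately before a given (inode, offset) by treating the key's high and low words as one 128-bit integer and subtracting one with a borrow; the all-zero key has no predecessor, hence the NULL result. The arithmetic on its own:

    #include <stdint.h>
    #include <stdio.h>

    struct key128 {
        uint64_t high, low;                  /* (high:low) read as one 128-bit value */
    };

    /* Step back by one; returns 0 if the value was already zero. */
    static int key128_pred(struct key128 *k)
    {
        if (!k->high && !k->low)
            return 0;                        /* the zero key has no predecessor */

        if (!k->low)
            k->high--;                       /* borrow from the high word */
        k->low--;

        return 1;
    }

    int main(void)
    {
        struct key128 k = { .high = 1, .low = 0 };

        key128_pred(&k);                     /* -> high 0, low UINT64_MAX */
        printf("%llu:%llu\n", (unsigned long long)k.high,
               (unsigned long long)k.low);
        return 0;
    }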
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index f42fc7ed9cd6..5e2765aadce1 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c | |||
@@ -23,12 +23,13 @@ | |||
23 | #include "bcache.h" | 23 | #include "bcache.h" |
24 | #include "btree.h" | 24 | #include "btree.h" |
25 | #include "debug.h" | 25 | #include "debug.h" |
26 | #include "request.h" | ||
27 | #include "writeback.h" | 26 | #include "writeback.h" |
28 | 27 | ||
29 | #include <linux/slab.h> | 28 | #include <linux/slab.h> |
30 | #include <linux/bitops.h> | 29 | #include <linux/bitops.h> |
30 | #include <linux/freezer.h> | ||
31 | #include <linux/hash.h> | 31 | #include <linux/hash.h> |
32 | #include <linux/kthread.h> | ||
32 | #include <linux/prefetch.h> | 33 | #include <linux/prefetch.h> |
33 | #include <linux/random.h> | 34 | #include <linux/random.h> |
34 | #include <linux/rcupdate.h> | 35 | #include <linux/rcupdate.h> |
@@ -88,15 +89,13 @@ | |||
88 | * Test module load/unload | 89 | * Test module load/unload |
89 | */ | 90 | */ |
90 | 91 | ||
91 | static const char * const op_types[] = { | 92 | enum { |
92 | "insert", "replace" | 93 | BTREE_INSERT_STATUS_INSERT, |
94 | BTREE_INSERT_STATUS_BACK_MERGE, | ||
95 | BTREE_INSERT_STATUS_OVERWROTE, | ||
96 | BTREE_INSERT_STATUS_FRONT_MERGE, | ||
93 | }; | 97 | }; |
94 | 98 | ||
95 | static const char *op_type(struct btree_op *op) | ||
96 | { | ||
97 | return op_types[op->type]; | ||
98 | } | ||
99 | |||
100 | #define MAX_NEED_GC 64 | 99 | #define MAX_NEED_GC 64 |
101 | #define MAX_SAVE_PRIO 72 | 100 | #define MAX_SAVE_PRIO 72 |
102 | 101 | ||
@@ -105,23 +104,89 @@ static const char *op_type(struct btree_op *op) | |||
105 | #define PTR_HASH(c, k) \ | 104 | #define PTR_HASH(c, k) \ |
106 | (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) | 105 | (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) |
107 | 106 | ||
108 | struct workqueue_struct *bch_gc_wq; | ||
109 | static struct workqueue_struct *btree_io_wq; | 107 | static struct workqueue_struct *btree_io_wq; |
110 | 108 | ||
111 | void bch_btree_op_init_stack(struct btree_op *op) | 109 | static inline bool should_split(struct btree *b) |
112 | { | 110 | { |
113 | memset(op, 0, sizeof(struct btree_op)); | 111 | struct bset *i = write_block(b); |
114 | closure_init_stack(&op->cl); | 112 | return b->written >= btree_blocks(b) || |
115 | op->lock = -1; | 113 | (b->written + __set_blocks(i, i->keys + 15, b->c) |
116 | bch_keylist_init(&op->keys); | 114 | > btree_blocks(b)); |
117 | } | 115 | } |
118 | 116 | ||
117 | #define insert_lock(s, b) ((b)->level <= (s)->lock) | ||
118 | |||
119 | /* | ||
120 | * These macros are for recursing down the btree - they handle the details of | ||
121 | * locking and looking up nodes in the cache for you. They're best treated as | ||
122 | * mere syntax when reading code that uses them. | ||
123 | * | ||
124 | * op->lock determines whether we take a read or a write lock at a given depth. | ||
125 | * If you've got a read lock and find that you need a write lock (i.e. you're | ||
126 | * going to have to split), set op->lock and return -EINTR; btree_root() will | ||
127 | * call you again and you'll have the correct lock. | ||
128 | */ | ||
129 | |||
130 | /** | ||
131 | * btree - recurse down the btree on a specified key | ||
132 | * @fn: function to call, which will be passed the child node | ||
133 | * @key: key to recurse on | ||
134 | * @b: parent btree node | ||
135 | * @op: pointer to struct btree_op | ||
136 | */ | ||
137 | #define btree(fn, key, b, op, ...) \ | ||
138 | ({ \ | ||
139 | int _r, l = (b)->level - 1; \ | ||
140 | bool _w = l <= (op)->lock; \ | ||
141 | struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \ | ||
142 | if (!IS_ERR(_child)) { \ | ||
143 | _child->parent = (b); \ | ||
144 | _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ | ||
145 | rw_unlock(_w, _child); \ | ||
146 | } else \ | ||
147 | _r = PTR_ERR(_child); \ | ||
148 | _r; \ | ||
149 | }) | ||
150 | |||
151 | /** | ||
152 | * btree_root - call a function on the root of the btree | ||
153 | * @fn: function to call, which will be passed the child node | ||
154 | * @c: cache set | ||
155 | * @op: pointer to struct btree_op | ||
156 | */ | ||
157 | #define btree_root(fn, c, op, ...) \ | ||
158 | ({ \ | ||
159 | int _r = -EINTR; \ | ||
160 | do { \ | ||
161 | struct btree *_b = (c)->root; \ | ||
162 | bool _w = insert_lock(op, _b); \ | ||
163 | rw_lock(_w, _b, _b->level); \ | ||
164 | if (_b == (c)->root && \ | ||
165 | _w == insert_lock(op, _b)) { \ | ||
166 | _b->parent = NULL; \ | ||
167 | _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ | ||
168 | } \ | ||
169 | rw_unlock(_w, _b); \ | ||
170 | bch_cannibalize_unlock(c); \ | ||
171 | if (_r == -ENOSPC) { \ | ||
172 | wait_event((c)->try_wait, \ | ||
173 | !(c)->try_harder); \ | ||
174 | _r = -EINTR; \ | ||
175 | } \ | ||
176 | } while (_r == -EINTR); \ | ||
177 | \ | ||
178 | _r; \ | ||
179 | }) | ||
180 | |||
119 | /* Btree key manipulation */ | 181 | /* Btree key manipulation */ |
120 | 182 | ||
121 | static void bkey_put(struct cache_set *c, struct bkey *k, int level) | 183 | void bkey_put(struct cache_set *c, struct bkey *k) |
122 | { | 184 | { |
123 | if ((level && KEY_OFFSET(k)) || !level) | 185 | unsigned i; |
124 | __bkey_put(c, k); | 186 | |
187 | for (i = 0; i < KEY_PTRS(k); i++) | ||
188 | if (ptr_available(c, k, i)) | ||
189 | atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); | ||
125 | } | 190 | } |
126 | 191 | ||
127 | /* Btree IO */ | 192 | /* Btree IO */ |
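The btree()/btree_root() macros introduced in the hunk above push all of the lock-and-retry plumbing into one place: a traversal that discovers it needs a write lock (or that it must wait for the cannibalize owner) bails out with -EINTR, and btree_root() simply re-locks the root at the stronger level and calls the function again. A stripped-down model of that fail/upgrade/retry control flow (the lock-level and return-code conventions mirror the macro, everything else is invented):

    #include <errno.h>
    #include <stdio.h>

    struct op {
        int lock;                            /* depth at which a write lock is wanted */
    };

    /* Pretend traversal: needs a write lock on level 0 before it can "split". */
    static int traverse(struct op *op, int have_write)
    {
        if (!have_write) {
            op->lock = 0;                    /* ask for the write lock next pass */
            return -EINTR;
        }
        return 0;                            /* did the work */
    }

    int main(void)
    {
        struct op op = { .lock = -1 };       /* start with read locks everywhere */
        int r;

        do {
            int write = (0 <= op.lock);      /* insert_lock() for level 0 */
            r = traverse(&op, write);        /* rw_lock()/rw_unlock() elided */
        } while (r == -EINTR);

        printf("done: %d\n", r);
        return 0;
    }

The first pass runs read-locked and fails with -EINTR after bumping op.lock; the second pass takes the write lock and succeeds, which is the two-trip pattern the macro's comment describes.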
@@ -145,6 +210,10 @@ static void bch_btree_node_read_done(struct btree *b) | |||
145 | iter->size = b->c->sb.bucket_size / b->c->sb.block_size; | 210 | iter->size = b->c->sb.bucket_size / b->c->sb.block_size; |
146 | iter->used = 0; | 211 | iter->used = 0; |
147 | 212 | ||
213 | #ifdef CONFIG_BCACHE_DEBUG | ||
214 | iter->b = b; | ||
215 | #endif | ||
216 | |||
148 | if (!i->seq) | 217 | if (!i->seq) |
149 | goto err; | 218 | goto err; |
150 | 219 | ||
@@ -160,7 +229,7 @@ static void bch_btree_node_read_done(struct btree *b) | |||
160 | goto err; | 229 | goto err; |
161 | 230 | ||
162 | err = "bad magic"; | 231 | err = "bad magic"; |
163 | if (i->magic != bset_magic(b->c)) | 232 | if (i->magic != bset_magic(&b->c->sb)) |
164 | goto err; | 233 | goto err; |
165 | 234 | ||
166 | err = "bad checksum"; | 235 | err = "bad checksum"; |
@@ -248,10 +317,7 @@ void bch_btree_node_read(struct btree *b) | |||
248 | goto err; | 317 | goto err; |
249 | 318 | ||
250 | bch_btree_node_read_done(b); | 319 | bch_btree_node_read_done(b); |
251 | |||
252 | spin_lock(&b->c->btree_read_time_lock); | ||
253 | bch_time_stats_update(&b->c->btree_read_time, start_time); | 320 | bch_time_stats_update(&b->c->btree_read_time, start_time); |
254 | spin_unlock(&b->c->btree_read_time_lock); | ||
255 | 321 | ||
256 | return; | 322 | return; |
257 | err: | 323 | err: |
@@ -327,7 +393,7 @@ static void do_btree_node_write(struct btree *b) | |||
327 | b->bio = bch_bbio_alloc(b->c); | 393 | b->bio = bch_bbio_alloc(b->c); |
328 | 394 | ||
329 | b->bio->bi_end_io = btree_node_write_endio; | 395 | b->bio->bi_end_io = btree_node_write_endio; |
330 | b->bio->bi_private = &b->io.cl; | 396 | b->bio->bi_private = cl; |
331 | b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; | 397 | b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; |
332 | b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); | 398 | b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); |
333 | bch_bio_map(b->bio, i); | 399 | bch_bio_map(b->bio, i); |
@@ -383,7 +449,7 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) | |||
383 | BUG_ON(b->written >= btree_blocks(b)); | 449 | BUG_ON(b->written >= btree_blocks(b)); |
384 | BUG_ON(b->written && !i->keys); | 450 | BUG_ON(b->written && !i->keys); |
385 | BUG_ON(b->sets->data->seq != i->seq); | 451 | BUG_ON(b->sets->data->seq != i->seq); |
386 | bch_check_key_order(b, i); | 452 | bch_check_keys(b, "writing"); |
387 | 453 | ||
388 | cancel_delayed_work(&b->work); | 454 | cancel_delayed_work(&b->work); |
389 | 455 | ||
@@ -405,6 +471,15 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) | |||
405 | bch_bset_init_next(b); | 471 | bch_bset_init_next(b); |
406 | } | 472 | } |
407 | 473 | ||
474 | static void bch_btree_node_write_sync(struct btree *b) | ||
475 | { | ||
476 | struct closure cl; | ||
477 | |||
478 | closure_init_stack(&cl); | ||
479 | bch_btree_node_write(b, &cl); | ||
480 | closure_sync(&cl); | ||
481 | } | ||
482 | |||
408 | static void btree_node_write_work(struct work_struct *w) | 483 | static void btree_node_write_work(struct work_struct *w) |
409 | { | 484 | { |
410 | struct btree *b = container_of(to_delayed_work(w), struct btree, work); | 485 | struct btree *b = container_of(to_delayed_work(w), struct btree, work); |
@@ -416,7 +491,7 @@ static void btree_node_write_work(struct work_struct *w) | |||
416 | rw_unlock(true, b); | 491 | rw_unlock(true, b); |
417 | } | 492 | } |
418 | 493 | ||
419 | static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) | 494 | static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) |
420 | { | 495 | { |
421 | struct bset *i = b->sets[b->nsets].data; | 496 | struct bset *i = b->sets[b->nsets].data; |
422 | struct btree_write *w = btree_current_write(b); | 497 | struct btree_write *w = btree_current_write(b); |
@@ -429,15 +504,15 @@ static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) | |||
429 | 504 | ||
430 | set_btree_node_dirty(b); | 505 | set_btree_node_dirty(b); |
431 | 506 | ||
432 | if (op && op->journal) { | 507 | if (journal_ref) { |
433 | if (w->journal && | 508 | if (w->journal && |
434 | journal_pin_cmp(b->c, w, op)) { | 509 | journal_pin_cmp(b->c, w->journal, journal_ref)) { |
435 | atomic_dec_bug(w->journal); | 510 | atomic_dec_bug(w->journal); |
436 | w->journal = NULL; | 511 | w->journal = NULL; |
437 | } | 512 | } |
438 | 513 | ||
439 | if (!w->journal) { | 514 | if (!w->journal) { |
440 | w->journal = op->journal; | 515 | w->journal = journal_ref; |
441 | atomic_inc(w->journal); | 516 | atomic_inc(w->journal); |
442 | } | 517 | } |
443 | } | 518 | } |
@@ -566,33 +641,32 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, | |||
566 | return b; | 641 | return b; |
567 | } | 642 | } |
568 | 643 | ||
569 | static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) | 644 | static int mca_reap(struct btree *b, unsigned min_order, bool flush) |
570 | { | 645 | { |
646 | struct closure cl; | ||
647 | |||
648 | closure_init_stack(&cl); | ||
571 | lockdep_assert_held(&b->c->bucket_lock); | 649 | lockdep_assert_held(&b->c->bucket_lock); |
572 | 650 | ||
573 | if (!down_write_trylock(&b->lock)) | 651 | if (!down_write_trylock(&b->lock)) |
574 | return -ENOMEM; | 652 | return -ENOMEM; |
575 | 653 | ||
576 | if (b->page_order < min_order) { | 654 | BUG_ON(btree_node_dirty(b) && !b->sets[0].data); |
655 | |||
656 | if (b->page_order < min_order || | ||
657 | (!flush && | ||
658 | (btree_node_dirty(b) || | ||
659 | atomic_read(&b->io.cl.remaining) != -1))) { | ||
577 | rw_unlock(true, b); | 660 | rw_unlock(true, b); |
578 | return -ENOMEM; | 661 | return -ENOMEM; |
579 | } | 662 | } |
580 | 663 | ||
581 | BUG_ON(btree_node_dirty(b) && !b->sets[0].data); | 664 | if (btree_node_dirty(b)) |
582 | 665 | bch_btree_node_write_sync(b); | |
583 | if (cl && btree_node_dirty(b)) | ||
584 | bch_btree_node_write(b, NULL); | ||
585 | |||
586 | if (cl) | ||
587 | closure_wait_event_async(&b->io.wait, cl, | ||
588 | atomic_read(&b->io.cl.remaining) == -1); | ||
589 | 666 | ||
590 | if (btree_node_dirty(b) || | 667 | /* wait for any in flight btree write */ |
591 | !closure_is_unlocked(&b->io.cl) || | 668 | closure_wait_event(&b->io.wait, &cl, |
592 | work_pending(&b->work.work)) { | 669 | atomic_read(&b->io.cl.remaining) == -1); |
593 | rw_unlock(true, b); | ||
594 | return -EAGAIN; | ||
595 | } | ||
596 | 670 | ||
597 | return 0; | 671 | return 0; |
598 | } | 672 | } |
@@ -633,7 +707,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, | |||
633 | break; | 707 | break; |
634 | 708 | ||
635 | if (++i > 3 && | 709 | if (++i > 3 && |
636 | !mca_reap(b, NULL, 0)) { | 710 | !mca_reap(b, 0, false)) { |
637 | mca_data_free(b); | 711 | mca_data_free(b); |
638 | rw_unlock(true, b); | 712 | rw_unlock(true, b); |
639 | freed++; | 713 | freed++; |
@@ -652,7 +726,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, | |||
652 | list_rotate_left(&c->btree_cache); | 726 | list_rotate_left(&c->btree_cache); |
653 | 727 | ||
654 | if (!b->accessed && | 728 | if (!b->accessed && |
655 | !mca_reap(b, NULL, 0)) { | 729 | !mca_reap(b, 0, false)) { |
656 | mca_bucket_free(b); | 730 | mca_bucket_free(b); |
657 | mca_data_free(b); | 731 | mca_data_free(b); |
658 | rw_unlock(true, b); | 732 | rw_unlock(true, b); |
@@ -723,12 +797,9 @@ int bch_btree_cache_alloc(struct cache_set *c) | |||
723 | { | 797 | { |
724 | unsigned i; | 798 | unsigned i; |
725 | 799 | ||
726 | /* XXX: doesn't check for errors */ | ||
727 | |||
728 | closure_init_unlocked(&c->gc); | ||
729 | |||
730 | for (i = 0; i < mca_reserve(c); i++) | 800 | for (i = 0; i < mca_reserve(c); i++) |
731 | mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); | 801 | if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL)) |
802 | return -ENOMEM; | ||
732 | 803 | ||
733 | list_splice_init(&c->btree_cache, | 804 | list_splice_init(&c->btree_cache, |
734 | &c->btree_cache_freeable); | 805 | &c->btree_cache_freeable); |
@@ -775,52 +846,27 @@ out: | |||
775 | return b; | 846 | return b; |
776 | } | 847 | } |
777 | 848 | ||
778 | static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, | 849 | static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) |
779 | int level, struct closure *cl) | ||
780 | { | 850 | { |
781 | int ret = -ENOMEM; | 851 | struct btree *b; |
782 | struct btree *i; | ||
783 | 852 | ||
784 | trace_bcache_btree_cache_cannibalize(c); | 853 | trace_bcache_btree_cache_cannibalize(c); |
785 | 854 | ||
786 | if (!cl) | 855 | if (!c->try_harder) { |
787 | return ERR_PTR(-ENOMEM); | 856 | c->try_harder = current; |
788 | 857 | c->try_harder_start = local_clock(); | |
789 | /* | 858 | } else if (c->try_harder != current) |
790 | * Trying to free up some memory - i.e. reuse some btree nodes - may | 859 | return ERR_PTR(-ENOSPC); |
791 | * require initiating IO to flush the dirty part of the node. If we're | ||
792 | * running under generic_make_request(), that IO will never finish and | ||
793 | * we would deadlock. Returning -EAGAIN causes the cache lookup code to | ||
794 | * punt to workqueue and retry. | ||
795 | */ | ||
796 | if (current->bio_list) | ||
797 | return ERR_PTR(-EAGAIN); | ||
798 | |||
799 | if (c->try_harder && c->try_harder != cl) { | ||
800 | closure_wait_event_async(&c->try_wait, cl, !c->try_harder); | ||
801 | return ERR_PTR(-EAGAIN); | ||
802 | } | ||
803 | 860 | ||
804 | c->try_harder = cl; | 861 | list_for_each_entry_reverse(b, &c->btree_cache, list) |
805 | c->try_harder_start = local_clock(); | 862 | if (!mca_reap(b, btree_order(k), false)) |
806 | retry: | 863 | return b; |
807 | list_for_each_entry_reverse(i, &c->btree_cache, list) { | ||
808 | int r = mca_reap(i, cl, btree_order(k)); | ||
809 | if (!r) | ||
810 | return i; | ||
811 | if (r != -ENOMEM) | ||
812 | ret = r; | ||
813 | } | ||
814 | 864 | ||
815 | if (ret == -EAGAIN && | 865 | list_for_each_entry_reverse(b, &c->btree_cache, list) |
816 | closure_blocking(cl)) { | 866 | if (!mca_reap(b, btree_order(k), true)) |
817 | mutex_unlock(&c->bucket_lock); | 867 | return b; |
818 | closure_sync(cl); | ||
819 | mutex_lock(&c->bucket_lock); | ||
820 | goto retry; | ||
821 | } | ||
822 | 868 | ||
823 | return ERR_PTR(ret); | 869 | return ERR_PTR(-ENOMEM); |
824 | } | 870 | } |
825 | 871 | ||
826 | /* | 872 | /* |
@@ -829,20 +875,21 @@ retry: | |||
829 | * cannibalize_bucket() will take. This means every time we unlock the root of | 875 | * cannibalize_bucket() will take. This means every time we unlock the root of |
830 | * the btree, we need to release this lock if we have it held. | 876 | * the btree, we need to release this lock if we have it held. |
831 | */ | 877 | */ |
832 | void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) | 878 | static void bch_cannibalize_unlock(struct cache_set *c) |
833 | { | 879 | { |
834 | if (c->try_harder == cl) { | 880 | if (c->try_harder == current) { |
835 | bch_time_stats_update(&c->try_harder_time, c->try_harder_start); | 881 | bch_time_stats_update(&c->try_harder_time, c->try_harder_start); |
836 | c->try_harder = NULL; | 882 | c->try_harder = NULL; |
837 | __closure_wake_up(&c->try_wait); | 883 | wake_up(&c->try_wait); |
838 | } | 884 | } |
839 | } | 885 | } |
840 | 886 | ||
841 | static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, | 887 | static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) |
842 | int level, struct closure *cl) | ||
843 | { | 888 | { |
844 | struct btree *b; | 889 | struct btree *b; |
845 | 890 | ||
891 | BUG_ON(current->bio_list); | ||
892 | |||
846 | lockdep_assert_held(&c->bucket_lock); | 893 | lockdep_assert_held(&c->bucket_lock); |
847 | 894 | ||
848 | if (mca_find(c, k)) | 895 | if (mca_find(c, k)) |
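mca_cannibalize() and bch_cannibalize_unlock() above replace the closure handoff with a plain ownership token: the first task records current in c->try_harder, anyone else gets -ENOSPC (btree_root() then parks them on try_wait), and the owner clears the field and wakes the waiters when it is finished. The token logic on its own, single-threaded and with stand-in types (the real code is serialized by bucket_lock, which this sketch omits):

    #include <errno.h>
    #include <stdio.h>

    struct task { int id; };

    static struct task *try_harder;          /* which task owns the reserve */

    static int cannibalize_lock(struct task *me)
    {
        if (!try_harder)
            try_harder = me;                 /* first claimant wins */
        else if (try_harder != me)
            return -ENOSPC;                  /* caller waits and retries */

        return 0;
    }

    static void cannibalize_unlock(struct task *me)
    {
        if (try_harder == me) {
            try_harder = NULL;
            /* the real version also wakes the try_wait waiters here */
        }
    }

    int main(void)
    {
        struct task a = { 1 }, b = { 2 };

        printf("a: %d\n", cannibalize_lock(&a));        /* 0       */
        printf("b: %d\n", cannibalize_lock(&b));        /* -ENOSPC */
        cannibalize_unlock(&a);
        printf("b retry: %d\n", cannibalize_lock(&b));  /* 0       */
        return 0;
    }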
@@ -852,14 +899,14 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, | |||
852 | * the list. Check if there's any freed nodes there: | 899 | * the list. Check if there's any freed nodes there: |
853 | */ | 900 | */ |
854 | list_for_each_entry(b, &c->btree_cache_freeable, list) | 901 | list_for_each_entry(b, &c->btree_cache_freeable, list) |
855 | if (!mca_reap(b, NULL, btree_order(k))) | 902 | if (!mca_reap(b, btree_order(k), false)) |
856 | goto out; | 903 | goto out; |
857 | 904 | ||
858 | /* We never free struct btree itself, just the memory that holds the on | 905 | /* We never free struct btree itself, just the memory that holds the on |
859 | * disk node. Check the freed list before allocating a new one: | 906 | * disk node. Check the freed list before allocating a new one: |
860 | */ | 907 | */ |
861 | list_for_each_entry(b, &c->btree_cache_freed, list) | 908 | list_for_each_entry(b, &c->btree_cache_freed, list) |
862 | if (!mca_reap(b, NULL, 0)) { | 909 | if (!mca_reap(b, 0, false)) { |
863 | mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); | 910 | mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); |
864 | if (!b->sets[0].data) | 911 | if (!b->sets[0].data) |
865 | goto err; | 912 | goto err; |
@@ -884,6 +931,7 @@ out: | |||
884 | 931 | ||
885 | lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); | 932 | lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); |
886 | b->level = level; | 933 | b->level = level; |
934 | b->parent = (void *) ~0UL; | ||
887 | 935 | ||
888 | mca_reinit(b); | 936 | mca_reinit(b); |
889 | 937 | ||
@@ -892,7 +940,7 @@ err: | |||
892 | if (b) | 940 | if (b) |
893 | rw_unlock(true, b); | 941 | rw_unlock(true, b); |
894 | 942 | ||
895 | b = mca_cannibalize(c, k, level, cl); | 943 | b = mca_cannibalize(c, k); |
896 | if (!IS_ERR(b)) | 944 | if (!IS_ERR(b)) |
897 | goto out; | 945 | goto out; |
898 | 946 | ||
@@ -903,17 +951,15 @@ err: | |||
903 | * bch_btree_node_get - find a btree node in the cache and lock it, reading it | 951 | * bch_btree_node_get - find a btree node in the cache and lock it, reading it |
904 | * in from disk if necessary. | 952 | * in from disk if necessary. |
905 | * | 953 | * |
906 | * If IO is necessary, it uses the closure embedded in struct btree_op to wait; | 954 | * If IO is necessary and running under generic_make_request, returns -EAGAIN. |
907 | * if that closure is in non blocking mode, will return -EAGAIN. | ||
908 | * | 955 | * |
909 | * The btree node will have either a read or a write lock held, depending on | 956 | * The btree node will have either a read or a write lock held, depending on |
910 | * level and op->lock. | 957 | * level and op->lock. |
911 | */ | 958 | */ |
912 | struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, | 959 | struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, |
913 | int level, struct btree_op *op) | 960 | int level, bool write) |
914 | { | 961 | { |
915 | int i = 0; | 962 | int i = 0; |
916 | bool write = level <= op->lock; | ||
917 | struct btree *b; | 963 | struct btree *b; |
918 | 964 | ||
919 | BUG_ON(level < 0); | 965 | BUG_ON(level < 0); |
@@ -925,7 +971,7 @@ retry: | |||
925 | return ERR_PTR(-EAGAIN); | 971 | return ERR_PTR(-EAGAIN); |
926 | 972 | ||
927 | mutex_lock(&c->bucket_lock); | 973 | mutex_lock(&c->bucket_lock); |
928 | b = mca_alloc(c, k, level, &op->cl); | 974 | b = mca_alloc(c, k, level); |
929 | mutex_unlock(&c->bucket_lock); | 975 | mutex_unlock(&c->bucket_lock); |
930 | 976 | ||
931 | if (!b) | 977 | if (!b) |
@@ -971,7 +1017,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) | |||
971 | struct btree *b; | 1017 | struct btree *b; |
972 | 1018 | ||
973 | mutex_lock(&c->bucket_lock); | 1019 | mutex_lock(&c->bucket_lock); |
974 | b = mca_alloc(c, k, level, NULL); | 1020 | b = mca_alloc(c, k, level); |
975 | mutex_unlock(&c->bucket_lock); | 1021 | mutex_unlock(&c->bucket_lock); |
976 | 1022 | ||
977 | if (!IS_ERR_OR_NULL(b)) { | 1023 | if (!IS_ERR_OR_NULL(b)) { |
@@ -982,17 +1028,12 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) | |||
982 | 1028 | ||
983 | /* Btree alloc */ | 1029 | /* Btree alloc */ |
984 | 1030 | ||
985 | static void btree_node_free(struct btree *b, struct btree_op *op) | 1031 | static void btree_node_free(struct btree *b) |
986 | { | 1032 | { |
987 | unsigned i; | 1033 | unsigned i; |
988 | 1034 | ||
989 | trace_bcache_btree_node_free(b); | 1035 | trace_bcache_btree_node_free(b); |
990 | 1036 | ||
991 | /* | ||
992 | * The BUG_ON() in btree_node_get() implies that we must have a write | ||
993 | * lock on parent to free or even invalidate a node | ||
994 | */ | ||
995 | BUG_ON(op->lock <= b->level); | ||
996 | BUG_ON(b == b->c->root); | 1037 | BUG_ON(b == b->c->root); |
997 | 1038 | ||
998 | if (btree_node_dirty(b)) | 1039 | if (btree_node_dirty(b)) |
@@ -1015,27 +1056,26 @@ static void btree_node_free(struct btree *b, struct btree_op *op) | |||
1015 | mutex_unlock(&b->c->bucket_lock); | 1056 | mutex_unlock(&b->c->bucket_lock); |
1016 | } | 1057 | } |
1017 | 1058 | ||
1018 | struct btree *bch_btree_node_alloc(struct cache_set *c, int level, | 1059 | struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait) |
1019 | struct closure *cl) | ||
1020 | { | 1060 | { |
1021 | BKEY_PADDED(key) k; | 1061 | BKEY_PADDED(key) k; |
1022 | struct btree *b = ERR_PTR(-EAGAIN); | 1062 | struct btree *b = ERR_PTR(-EAGAIN); |
1023 | 1063 | ||
1024 | mutex_lock(&c->bucket_lock); | 1064 | mutex_lock(&c->bucket_lock); |
1025 | retry: | 1065 | retry: |
1026 | if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl)) | 1066 | if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait)) |
1027 | goto err; | 1067 | goto err; |
1028 | 1068 | ||
1069 | bkey_put(c, &k.key); | ||
1029 | SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); | 1070 | SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); |
1030 | 1071 | ||
1031 | b = mca_alloc(c, &k.key, level, cl); | 1072 | b = mca_alloc(c, &k.key, level); |
1032 | if (IS_ERR(b)) | 1073 | if (IS_ERR(b)) |
1033 | goto err_free; | 1074 | goto err_free; |
1034 | 1075 | ||
1035 | if (!b) { | 1076 | if (!b) { |
1036 | cache_bug(c, | 1077 | cache_bug(c, |
1037 | "Tried to allocate bucket that was in btree cache"); | 1078 | "Tried to allocate bucket that was in btree cache"); |
1038 | __bkey_put(c, &k.key); | ||
1039 | goto retry; | 1079 | goto retry; |
1040 | } | 1080 | } |
1041 | 1081 | ||
@@ -1048,7 +1088,6 @@ retry: | |||
1048 | return b; | 1088 | return b; |
1049 | err_free: | 1089 | err_free: |
1050 | bch_bucket_free(c, &k.key); | 1090 | bch_bucket_free(c, &k.key); |
1051 | __bkey_put(c, &k.key); | ||
1052 | err: | 1091 | err: |
1053 | mutex_unlock(&c->bucket_lock); | 1092 | mutex_unlock(&c->bucket_lock); |
1054 | 1093 | ||
@@ -1056,16 +1095,31 @@ err: | |||
1056 | return b; | 1095 | return b; |
1057 | } | 1096 | } |
1058 | 1097 | ||
1059 | static struct btree *btree_node_alloc_replacement(struct btree *b, | 1098 | static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) |
1060 | struct closure *cl) | ||
1061 | { | 1099 | { |
1062 | struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); | 1100 | struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); |
1063 | if (!IS_ERR_OR_NULL(n)) | 1101 | if (!IS_ERR_OR_NULL(n)) |
1064 | bch_btree_sort_into(b, n); | 1102 | bch_btree_sort_into(b, n); |
1065 | 1103 | ||
1066 | return n; | 1104 | return n; |
1067 | } | 1105 | } |
1068 | 1106 | ||
1107 | static void make_btree_freeing_key(struct btree *b, struct bkey *k) | ||
1108 | { | ||
1109 | unsigned i; | ||
1110 | |||
1111 | bkey_copy(k, &b->key); | ||
1112 | bkey_copy_key(k, &ZERO_KEY); | ||
1113 | |||
1114 | for (i = 0; i < KEY_PTRS(k); i++) { | ||
1115 | uint8_t g = PTR_BUCKET(b->c, k, i)->gen + 1; | ||
1116 | |||
1117 | SET_PTR_GEN(k, i, g); | ||
1118 | } | ||
1119 | |||
1120 | atomic_inc(&b->c->prio_blocked); | ||
1121 | } | ||
1122 | |||
1069 | /* Garbage collection */ | 1123 | /* Garbage collection */ |
1070 | 1124 | ||
1071 | uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) | 1125 | uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) |
@@ -1119,12 +1173,10 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) | |||
1119 | 1173 | ||
1120 | #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) | 1174 | #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) |
1121 | 1175 | ||
1122 | static int btree_gc_mark_node(struct btree *b, unsigned *keys, | 1176 | static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) |
1123 | struct gc_stat *gc) | ||
1124 | { | 1177 | { |
1125 | uint8_t stale = 0; | 1178 | uint8_t stale = 0; |
1126 | unsigned last_dev = -1; | 1179 | unsigned keys = 0, good_keys = 0; |
1127 | struct bcache_device *d = NULL; | ||
1128 | struct bkey *k; | 1180 | struct bkey *k; |
1129 | struct btree_iter iter; | 1181 | struct btree_iter iter; |
1130 | struct bset_tree *t; | 1182 | struct bset_tree *t; |
@@ -1132,27 +1184,17 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys, | |||
1132 | gc->nodes++; | 1184 | gc->nodes++; |
1133 | 1185 | ||
1134 | for_each_key_filter(b, k, &iter, bch_ptr_invalid) { | 1186 | for_each_key_filter(b, k, &iter, bch_ptr_invalid) { |
1135 | if (last_dev != KEY_INODE(k)) { | ||
1136 | last_dev = KEY_INODE(k); | ||
1137 | |||
1138 | d = KEY_INODE(k) < b->c->nr_uuids | ||
1139 | ? b->c->devices[last_dev] | ||
1140 | : NULL; | ||
1141 | } | ||
1142 | |||
1143 | stale = max(stale, btree_mark_key(b, k)); | 1187 | stale = max(stale, btree_mark_key(b, k)); |
1188 | keys++; | ||
1144 | 1189 | ||
1145 | if (bch_ptr_bad(b, k)) | 1190 | if (bch_ptr_bad(b, k)) |
1146 | continue; | 1191 | continue; |
1147 | 1192 | ||
1148 | *keys += bkey_u64s(k); | ||
1149 | |||
1150 | gc->key_bytes += bkey_u64s(k); | 1193 | gc->key_bytes += bkey_u64s(k); |
1151 | gc->nkeys++; | 1194 | gc->nkeys++; |
1195 | good_keys++; | ||
1152 | 1196 | ||
1153 | gc->data += KEY_SIZE(k); | 1197 | gc->data += KEY_SIZE(k); |
1154 | if (KEY_DIRTY(k)) | ||
1155 | gc->dirty += KEY_SIZE(k); | ||
1156 | } | 1198 | } |
1157 | 1199 | ||
1158 | for (t = b->sets; t <= &b->sets[b->nsets]; t++) | 1200 | for (t = b->sets; t <= &b->sets[b->nsets]; t++) |
@@ -1161,78 +1203,74 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys, | |||
1161 | bkey_cmp(&b->key, &t->end) < 0, | 1203 | bkey_cmp(&b->key, &t->end) < 0, |
1162 | b, "found short btree key in gc"); | 1204 | b, "found short btree key in gc"); |
1163 | 1205 | ||
1164 | return stale; | 1206 | if (b->c->gc_always_rewrite) |
1165 | } | 1207 | return true; |
1166 | |||
1167 | static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, | ||
1168 | struct btree_op *op) | ||
1169 | { | ||
1170 | /* | ||
1171 | * We block priorities from being written for the duration of garbage | ||
1172 | * collection, so we can't sleep in btree_alloc() -> | ||
1173 | * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it | ||
1174 | * our closure. | ||
1175 | */ | ||
1176 | struct btree *n = btree_node_alloc_replacement(b, NULL); | ||
1177 | |||
1178 | if (!IS_ERR_OR_NULL(n)) { | ||
1179 | swap(b, n); | ||
1180 | __bkey_put(b->c, &b->key); | ||
1181 | 1208 | ||
1182 | memcpy(k->ptr, b->key.ptr, | 1209 | if (stale > 10) |
1183 | sizeof(uint64_t) * KEY_PTRS(&b->key)); | 1210 | return true; |
1184 | 1211 | ||
1185 | btree_node_free(n, op); | 1212 | if ((keys - good_keys) * 2 > keys) |
1186 | up_write(&n->lock); | 1213 | return true; |
1187 | } | ||
1188 | 1214 | ||
1189 | return b; | 1215 | return false; |
1190 | } | 1216 | } |
1191 | 1217 | ||
1192 | /* | 1218 | #define GC_MERGE_NODES 4U |
1193 | * Leaving this at 2 until we've got incremental garbage collection done; it | ||
1194 | * could be higher (and has been tested with 4) except that garbage collection | ||
1195 | * could take much longer, adversely affecting latency. | ||
1196 | */ | ||
1197 | #define GC_MERGE_NODES 2U | ||
1198 | 1219 | ||
1199 | struct gc_merge_info { | 1220 | struct gc_merge_info { |
1200 | struct btree *b; | 1221 | struct btree *b; |
1201 | struct bkey *k; | ||
1202 | unsigned keys; | 1222 | unsigned keys; |
1203 | }; | 1223 | }; |
1204 | 1224 | ||
1205 | static void btree_gc_coalesce(struct btree *b, struct btree_op *op, | 1225 | static int bch_btree_insert_node(struct btree *, struct btree_op *, |
1206 | struct gc_stat *gc, struct gc_merge_info *r) | 1226 | struct keylist *, atomic_t *, struct bkey *); |
1227 | |||
1228 | static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | ||
1229 | struct keylist *keylist, struct gc_stat *gc, | ||
1230 | struct gc_merge_info *r) | ||
1207 | { | 1231 | { |
1208 | unsigned nodes = 0, keys = 0, blocks; | 1232 | unsigned i, nodes = 0, keys = 0, blocks; |
1209 | int i; | 1233 | struct btree *new_nodes[GC_MERGE_NODES]; |
1234 | struct closure cl; | ||
1235 | struct bkey *k; | ||
1236 | |||
1237 | memset(new_nodes, 0, sizeof(new_nodes)); | ||
1238 | closure_init_stack(&cl); | ||
1210 | 1239 | ||
1211 | while (nodes < GC_MERGE_NODES && r[nodes].b) | 1240 | while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) |
1212 | keys += r[nodes++].keys; | 1241 | keys += r[nodes++].keys; |
1213 | 1242 | ||
1214 | blocks = btree_default_blocks(b->c) * 2 / 3; | 1243 | blocks = btree_default_blocks(b->c) * 2 / 3; |
1215 | 1244 | ||
1216 | if (nodes < 2 || | 1245 | if (nodes < 2 || |
1217 | __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) | 1246 | __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) |
1218 | return; | 1247 | return 0; |
1219 | |||
1220 | for (i = nodes - 1; i >= 0; --i) { | ||
1221 | if (r[i].b->written) | ||
1222 | r[i].b = btree_gc_alloc(r[i].b, r[i].k, op); | ||
1223 | 1248 | ||
1224 | if (r[i].b->written) | 1249 | for (i = 0; i < nodes; i++) { |
1225 | return; | 1250 | new_nodes[i] = btree_node_alloc_replacement(r[i].b, false); |
1251 | if (IS_ERR_OR_NULL(new_nodes[i])) | ||
1252 | goto out_nocoalesce; | ||
1226 | } | 1253 | } |
1227 | 1254 | ||
1228 | for (i = nodes - 1; i > 0; --i) { | 1255 | for (i = nodes - 1; i > 0; --i) { |
1229 | struct bset *n1 = r[i].b->sets->data; | 1256 | struct bset *n1 = new_nodes[i]->sets->data; |
1230 | struct bset *n2 = r[i - 1].b->sets->data; | 1257 | struct bset *n2 = new_nodes[i - 1]->sets->data; |
1231 | struct bkey *k, *last = NULL; | 1258 | struct bkey *k, *last = NULL; |
1232 | 1259 | ||
1233 | keys = 0; | 1260 | keys = 0; |
1234 | 1261 | ||
1235 | if (i == 1) { | 1262 | if (i > 1) { |
1263 | for (k = n2->start; | ||
1264 | k < end(n2); | ||
1265 | k = bkey_next(k)) { | ||
1266 | if (__set_blocks(n1, n1->keys + keys + | ||
1267 | bkey_u64s(k), b->c) > blocks) | ||
1268 | break; | ||
1269 | |||
1270 | last = k; | ||
1271 | keys += bkey_u64s(k); | ||
1272 | } | ||
1273 | } else { | ||
1236 | /* | 1274 | /* |
1237 | * Last node we're not getting rid of - we're getting | 1275 | * Last node we're not getting rid of - we're getting |
1238 | * rid of the node at r[0]. Have to try and fit all of | 1276 | * rid of the node at r[0]. Have to try and fit all of |
@@ -1241,37 +1279,27 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1241 | * length keys (shouldn't be possible in practice, | 1279 | * length keys (shouldn't be possible in practice, |
1242 | * though) | 1280 | * though) |
1243 | */ | 1281 | */ |
1244 | if (__set_blocks(n1, n1->keys + r->keys, | 1282 | if (__set_blocks(n1, n1->keys + n2->keys, |
1245 | b->c) > btree_blocks(r[i].b)) | 1283 | b->c) > btree_blocks(new_nodes[i])) |
1246 | return; | 1284 | goto out_nocoalesce; |
1247 | 1285 | ||
1248 | keys = n2->keys; | 1286 | keys = n2->keys; |
1287 | /* Take the key of the node we're getting rid of */ | ||
1249 | last = &r->b->key; | 1288 | last = &r->b->key; |
1250 | } else | 1289 | } |
1251 | for (k = n2->start; | ||
1252 | k < end(n2); | ||
1253 | k = bkey_next(k)) { | ||
1254 | if (__set_blocks(n1, n1->keys + keys + | ||
1255 | bkey_u64s(k), b->c) > blocks) | ||
1256 | break; | ||
1257 | |||
1258 | last = k; | ||
1259 | keys += bkey_u64s(k); | ||
1260 | } | ||
1261 | 1290 | ||
1262 | BUG_ON(__set_blocks(n1, n1->keys + keys, | 1291 | BUG_ON(__set_blocks(n1, n1->keys + keys, |
1263 | b->c) > btree_blocks(r[i].b)); | 1292 | b->c) > btree_blocks(new_nodes[i])); |
1264 | 1293 | ||
1265 | if (last) { | 1294 | if (last) |
1266 | bkey_copy_key(&r[i].b->key, last); | 1295 | bkey_copy_key(&new_nodes[i]->key, last); |
1267 | bkey_copy_key(r[i].k, last); | ||
1268 | } | ||
1269 | 1296 | ||
1270 | memcpy(end(n1), | 1297 | memcpy(end(n1), |
1271 | n2->start, | 1298 | n2->start, |
1272 | (void *) node(n2, keys) - (void *) n2->start); | 1299 | (void *) node(n2, keys) - (void *) n2->start); |
1273 | 1300 | ||
1274 | n1->keys += keys; | 1301 | n1->keys += keys; |
1302 | r[i].keys = n1->keys; | ||
1275 | 1303 | ||
1276 | memmove(n2->start, | 1304 | memmove(n2->start, |
1277 | node(n2, keys), | 1305 | node(n2, keys), |
@@ -1279,95 +1307,176 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1279 | 1307 | ||
1280 | n2->keys -= keys; | 1308 | n2->keys -= keys; |
1281 | 1309 | ||
1282 | r[i].keys = n1->keys; | 1310 | if (bch_keylist_realloc(keylist, |
1283 | r[i - 1].keys = n2->keys; | 1311 | KEY_PTRS(&new_nodes[i]->key), b->c)) |
1312 | goto out_nocoalesce; | ||
1313 | |||
1314 | bch_btree_node_write(new_nodes[i], &cl); | ||
1315 | bch_keylist_add(keylist, &new_nodes[i]->key); | ||
1284 | } | 1316 | } |
1285 | 1317 | ||
1286 | btree_node_free(r->b, op); | 1318 | for (i = 0; i < nodes; i++) { |
1287 | up_write(&r->b->lock); | 1319 | if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c)) |
1320 | goto out_nocoalesce; | ||
1288 | 1321 | ||
1289 | trace_bcache_btree_gc_coalesce(nodes); | 1322 | make_btree_freeing_key(r[i].b, keylist->top); |
1323 | bch_keylist_push(keylist); | ||
1324 | } | ||
1325 | |||
1326 | /* We emptied out this node */ | ||
1327 | BUG_ON(new_nodes[0]->sets->data->keys); | ||
1328 | btree_node_free(new_nodes[0]); | ||
1329 | rw_unlock(true, new_nodes[0]); | ||
1330 | |||
1331 | closure_sync(&cl); | ||
1332 | |||
1333 | for (i = 0; i < nodes; i++) { | ||
1334 | btree_node_free(r[i].b); | ||
1335 | rw_unlock(true, r[i].b); | ||
1336 | |||
1337 | r[i].b = new_nodes[i]; | ||
1338 | } | ||
1339 | |||
1340 | bch_btree_insert_node(b, op, keylist, NULL, NULL); | ||
1341 | BUG_ON(!bch_keylist_empty(keylist)); | ||
1342 | |||
1343 | memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); | ||
1344 | r[nodes - 1].b = ERR_PTR(-EINTR); | ||
1290 | 1345 | ||
1346 | trace_bcache_btree_gc_coalesce(nodes); | ||
1291 | gc->nodes--; | 1347 | gc->nodes--; |
1292 | nodes--; | ||
1293 | 1348 | ||
1294 | memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes); | 1349 | /* Invalidated our iterator */ |
1295 | memset(&r[nodes], 0, sizeof(struct gc_merge_info)); | 1350 | return -EINTR; |
1351 | |||
1352 | out_nocoalesce: | ||
1353 | closure_sync(&cl); | ||
1354 | |||
1355 | while ((k = bch_keylist_pop(keylist))) | ||
1356 | if (!bkey_cmp(k, &ZERO_KEY)) | ||
1357 | atomic_dec(&b->c->prio_blocked); | ||
1358 | |||
1359 | for (i = 0; i < nodes; i++) | ||
1360 | if (!IS_ERR_OR_NULL(new_nodes[i])) { | ||
1361 | btree_node_free(new_nodes[i]); | ||
1362 | rw_unlock(true, new_nodes[i]); | ||
1363 | } | ||
1364 | return 0; | ||
1296 | } | 1365 | } |
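btree_gc_coalesce() only goes ahead when the keys of the whole window would fit into one node fewer, each filled to at most two thirds. A rough, self-contained model of that admission test; set_blocks() here is a deliberately simplified stand-in for __set_blocks() (it ignores the per-bset header the real helper accounts for):

```c
#include <stdbool.h>

/*
 * Simplified stand-in for __set_blocks(): how many btree blocks this many
 * key u64s would occupy, ignoring the per-bset header.
 */
static unsigned set_blocks(unsigned key_u64s, unsigned u64s_per_block)
{
	return (key_u64s + u64s_per_block - 1) / u64s_per_block;
}

/*
 * Coalescing a window of 'nodes' siblings is worthwhile only if all their
 * keys fit in (nodes - 1) nodes, each at most two-thirds full - the same
 * shape as the check at the top of btree_gc_coalesce().
 */
static bool worth_coalescing(unsigned nodes, unsigned total_key_u64s,
			     unsigned node_blocks, unsigned u64s_per_block)
{
	unsigned budget = node_blocks * 2 / 3;

	if (nodes < 2)
		return false;

	return set_blocks(total_key_u64s, u64s_per_block) <=
		budget * (nodes - 1);
}

int main(void)
{
	/* 4 siblings, 500 key u64s total, 8-block nodes, 128 u64s per block */
	return worth_coalescing(4, 500, 8, 128) ? 0 : 1;
}
```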
1297 | 1366 | ||
1298 | static int btree_gc_recurse(struct btree *b, struct btree_op *op, | 1367 | static unsigned btree_gc_count_keys(struct btree *b) |
1299 | struct closure *writes, struct gc_stat *gc) | ||
1300 | { | 1368 | { |
1301 | void write(struct btree *r) | 1369 | struct bkey *k; |
1302 | { | 1370 | struct btree_iter iter; |
1303 | if (!r->written) | 1371 | unsigned ret = 0; |
1304 | bch_btree_node_write(r, &op->cl); | ||
1305 | else if (btree_node_dirty(r)) | ||
1306 | bch_btree_node_write(r, writes); | ||
1307 | 1372 | ||
1308 | up_write(&r->lock); | 1373 | for_each_key_filter(b, k, &iter, bch_ptr_bad) |
1309 | } | 1374 | ret += bkey_u64s(k); |
1375 | |||
1376 | return ret; | ||
1377 | } | ||
1310 | 1378 | ||
1311 | int ret = 0, stale; | 1379 | static int btree_gc_recurse(struct btree *b, struct btree_op *op, |
1380 | struct closure *writes, struct gc_stat *gc) | ||
1381 | { | ||
1312 | unsigned i; | 1382 | unsigned i; |
1383 | int ret = 0; | ||
1384 | bool should_rewrite; | ||
1385 | struct btree *n; | ||
1386 | struct bkey *k; | ||
1387 | struct keylist keys; | ||
1388 | struct btree_iter iter; | ||
1313 | struct gc_merge_info r[GC_MERGE_NODES]; | 1389 | struct gc_merge_info r[GC_MERGE_NODES]; |
1390 | struct gc_merge_info *last = r + GC_MERGE_NODES - 1; | ||
1314 | 1391 | ||
1315 | memset(r, 0, sizeof(r)); | 1392 | bch_keylist_init(&keys); |
1393 | bch_btree_iter_init(b, &iter, &b->c->gc_done); | ||
1316 | 1394 | ||
1317 | while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { | 1395 | for (i = 0; i < GC_MERGE_NODES; i++) |
1318 | r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); | 1396 | r[i].b = ERR_PTR(-EINTR); |
1319 | 1397 | ||
1320 | if (IS_ERR(r->b)) { | 1398 | while (1) { |
1321 | ret = PTR_ERR(r->b); | 1399 | k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); |
1322 | break; | 1400 | if (k) { |
1401 | r->b = bch_btree_node_get(b->c, k, b->level - 1, true); | ||
1402 | if (IS_ERR(r->b)) { | ||
1403 | ret = PTR_ERR(r->b); | ||
1404 | break; | ||
1405 | } | ||
1406 | |||
1407 | r->keys = btree_gc_count_keys(r->b); | ||
1408 | |||
1409 | ret = btree_gc_coalesce(b, op, &keys, gc, r); | ||
1410 | if (ret) | ||
1411 | break; | ||
1323 | } | 1412 | } |
1324 | 1413 | ||
1325 | r->keys = 0; | 1414 | if (!last->b) |
1326 | stale = btree_gc_mark_node(r->b, &r->keys, gc); | 1415 | break; |
1327 | 1416 | ||
1328 | if (!b->written && | 1417 | if (!IS_ERR(last->b)) { |
1329 | (r->b->level || stale > 10 || | 1418 | should_rewrite = btree_gc_mark_node(last->b, gc); |
1330 | b->c->gc_always_rewrite)) | 1419 | if (should_rewrite) { |
1331 | r->b = btree_gc_alloc(r->b, r->k, op); | 1420 | n = btree_node_alloc_replacement(last->b, |
1421 | false); | ||
1332 | 1422 | ||
1333 | if (r->b->level) | 1423 | if (!IS_ERR_OR_NULL(n)) { |
1334 | ret = btree_gc_recurse(r->b, op, writes, gc); | 1424 | bch_btree_node_write_sync(n); |
1425 | bch_keylist_add(&keys, &n->key); | ||
1335 | 1426 | ||
1336 | if (ret) { | 1427 | make_btree_freeing_key(last->b, |
1337 | write(r->b); | 1428 | keys.top); |
1338 | break; | 1429 | bch_keylist_push(&keys); |
1339 | } | 1430 | |
1431 | btree_node_free(last->b); | ||
1432 | |||
1433 | bch_btree_insert_node(b, op, &keys, | ||
1434 | NULL, NULL); | ||
1435 | BUG_ON(!bch_keylist_empty(&keys)); | ||
1340 | 1436 | ||
1341 | bkey_copy_key(&b->c->gc_done, r->k); | 1437 | rw_unlock(true, last->b); |
1438 | last->b = n; | ||
1342 | 1439 | ||
1343 | if (!b->written) | 1440 | /* Invalidated our iterator */ |
1344 | btree_gc_coalesce(b, op, gc, r); | 1441 | ret = -EINTR; |
1442 | break; | ||
1443 | } | ||
1444 | } | ||
1345 | 1445 | ||
1346 | if (r[GC_MERGE_NODES - 1].b) | 1446 | if (last->b->level) { |
1347 | write(r[GC_MERGE_NODES - 1].b); | 1447 | ret = btree_gc_recurse(last->b, op, writes, gc); |
1448 | if (ret) | ||
1449 | break; | ||
1450 | } | ||
1348 | 1451 | ||
1349 | memmove(&r[1], &r[0], | 1452 | bkey_copy_key(&b->c->gc_done, &last->b->key); |
1350 | sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1)); | 1453 | |
1454 | /* | ||
1455 | * Must flush leaf nodes before gc ends, since replace | ||
1456 | * operations aren't journalled | ||
1457 | */ | ||
1458 | if (btree_node_dirty(last->b)) | ||
1459 | bch_btree_node_write(last->b, writes); | ||
1460 | rw_unlock(true, last->b); | ||
1461 | } | ||
1462 | |||
1463 | memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); | ||
1464 | r->b = NULL; | ||
1351 | 1465 | ||
1352 | /* When we've got incremental GC working, we'll want to do | ||
1353 | * if (should_resched()) | ||
1354 | * return -EAGAIN; | ||
1355 | */ | ||
1356 | cond_resched(); | ||
1357 | #if 0 | ||
1358 | if (need_resched()) { | 1466 | if (need_resched()) { |
1359 | ret = -EAGAIN; | 1467 | ret = -EAGAIN; |
1360 | break; | 1468 | break; |
1361 | } | 1469 | } |
1362 | #endif | ||
1363 | } | 1470 | } |
1364 | 1471 | ||
1365 | for (i = 1; i < GC_MERGE_NODES && r[i].b; i++) | 1472 | for (i = 0; i < GC_MERGE_NODES; i++) |
1366 | write(r[i].b); | 1473 | if (!IS_ERR_OR_NULL(r[i].b)) { |
1474 | if (btree_node_dirty(r[i].b)) | ||
1475 | bch_btree_node_write(r[i].b, writes); | ||
1476 | rw_unlock(true, r[i].b); | ||
1477 | } | ||
1367 | 1478 | ||
1368 | /* Might have freed some children, must remove their keys */ | 1479 | bch_keylist_free(&keys); |
1369 | if (!b->written) | ||
1370 | bch_btree_sort(b); | ||
1371 | 1480 | ||
1372 | return ret; | 1481 | return ret; |
1373 | } | 1482 | } |
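btree_gc_recurse() now keeps the last GC_MERGE_NODES children it visited in the r[] array: each new child is shifted in at r[0] and the oldest falls off the far end, so the coalescing pass always sees a contiguous window of siblings. The bookkeeping is a memmove-based shift; a small self-contained model (the element type is invented for illustration):

```c
#include <stdio.h>
#include <string.h>

#define WINDOW 4U  /* plays the role of GC_MERGE_NODES */

struct slot {
	int node_id;   /* hypothetical: identifies a visited child node */
	unsigned keys; /* key count cached for the coalescing decision */
};

/*
 * Shift the window right by one and place the newest entry at the front,
 * dropping the oldest - the same memmove pattern btree_gc_recurse() uses.
 */
static void window_push(struct slot *r, struct slot newest)
{
	memmove(r + 1, r, sizeof(r[0]) * (WINDOW - 1));
	r[0] = newest;
}

int main(void)
{
	struct slot r[WINDOW] = { 0 };

	for (int i = 1; i <= 6; i++)
		window_push(r, (struct slot){ .node_id = i, .keys = 10u * i });

	for (unsigned i = 0; i < WINDOW; i++)
		printf("r[%u] = node %d\n", i, r[i].node_id);
	/* Prints nodes 6, 5, 4, 3: only the most recent WINDOW children. */
	return 0;
}
```

After pushing six nodes through a four-entry window, only the most recent four remain, newest first.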
@@ -1376,29 +1485,31 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, | |||
1376 | struct closure *writes, struct gc_stat *gc) | 1485 | struct closure *writes, struct gc_stat *gc) |
1377 | { | 1486 | { |
1378 | struct btree *n = NULL; | 1487 | struct btree *n = NULL; |
1379 | unsigned keys = 0; | 1488 | int ret = 0; |
1380 | int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); | 1489 | bool should_rewrite; |
1381 | |||
1382 | if (b->level || stale > 10) | ||
1383 | n = btree_node_alloc_replacement(b, NULL); | ||
1384 | 1490 | ||
1385 | if (!IS_ERR_OR_NULL(n)) | 1491 | should_rewrite = btree_gc_mark_node(b, gc); |
1386 | swap(b, n); | 1492 | if (should_rewrite) { |
1493 | n = btree_node_alloc_replacement(b, false); | ||
1387 | 1494 | ||
1388 | if (b->level) | 1495 | if (!IS_ERR_OR_NULL(n)) { |
1389 | ret = btree_gc_recurse(b, op, writes, gc); | 1496 | bch_btree_node_write_sync(n); |
1497 | bch_btree_set_root(n); | ||
1498 | btree_node_free(b); | ||
1499 | rw_unlock(true, n); | ||
1390 | 1500 | ||
1391 | if (!b->written || btree_node_dirty(b)) { | 1501 | return -EINTR; |
1392 | bch_btree_node_write(b, n ? &op->cl : NULL); | 1502 | } |
1393 | } | 1503 | } |
1394 | 1504 | ||
1395 | if (!IS_ERR_OR_NULL(n)) { | 1505 | if (b->level) { |
1396 | closure_sync(&op->cl); | 1506 | ret = btree_gc_recurse(b, op, writes, gc); |
1397 | bch_btree_set_root(b); | 1507 | if (ret) |
1398 | btree_node_free(n, op); | 1508 | return ret; |
1399 | rw_unlock(true, b); | ||
1400 | } | 1509 | } |
1401 | 1510 | ||
1511 | bkey_copy_key(&b->c->gc_done, &b->key); | ||
1512 | |||
1402 | return ret; | 1513 | return ret; |
1403 | } | 1514 | } |
1404 | 1515 | ||
@@ -1479,9 +1590,8 @@ size_t bch_btree_gc_finish(struct cache_set *c) | |||
1479 | return available; | 1590 | return available; |
1480 | } | 1591 | } |
1481 | 1592 | ||
1482 | static void bch_btree_gc(struct closure *cl) | 1593 | static void bch_btree_gc(struct cache_set *c) |
1483 | { | 1594 | { |
1484 | struct cache_set *c = container_of(cl, struct cache_set, gc.cl); | ||
1485 | int ret; | 1595 | int ret; |
1486 | unsigned long available; | 1596 | unsigned long available; |
1487 | struct gc_stat stats; | 1597 | struct gc_stat stats; |
@@ -1493,47 +1603,73 @@ static void bch_btree_gc(struct closure *cl) | |||
1493 | 1603 | ||
1494 | memset(&stats, 0, sizeof(struct gc_stat)); | 1604 | memset(&stats, 0, sizeof(struct gc_stat)); |
1495 | closure_init_stack(&writes); | 1605 | closure_init_stack(&writes); |
1496 | bch_btree_op_init_stack(&op); | 1606 | bch_btree_op_init(&op, SHRT_MAX); |
1497 | op.lock = SHRT_MAX; | ||
1498 | 1607 | ||
1499 | btree_gc_start(c); | 1608 | btree_gc_start(c); |
1500 | 1609 | ||
1501 | atomic_inc(&c->prio_blocked); | 1610 | do { |
1502 | 1611 | ret = btree_root(gc_root, c, &op, &writes, &stats); | |
1503 | ret = btree_root(gc_root, c, &op, &writes, &stats); | 1612 | closure_sync(&writes); |
1504 | closure_sync(&op.cl); | ||
1505 | closure_sync(&writes); | ||
1506 | |||
1507 | if (ret) { | ||
1508 | pr_warn("gc failed!"); | ||
1509 | continue_at(cl, bch_btree_gc, bch_gc_wq); | ||
1510 | } | ||
1511 | 1613 | ||
1512 | /* Possibly wait for new UUIDs or whatever to hit disk */ | 1614 | if (ret && ret != -EAGAIN) |
1513 | bch_journal_meta(c, &op.cl); | 1615 | pr_warn("gc failed!"); |
1514 | closure_sync(&op.cl); | 1616 | } while (ret); |
1515 | 1617 | ||
1516 | available = bch_btree_gc_finish(c); | 1618 | available = bch_btree_gc_finish(c); |
1517 | |||
1518 | atomic_dec(&c->prio_blocked); | ||
1519 | wake_up_allocators(c); | 1619 | wake_up_allocators(c); |
1520 | 1620 | ||
1521 | bch_time_stats_update(&c->btree_gc_time, start_time); | 1621 | bch_time_stats_update(&c->btree_gc_time, start_time); |
1522 | 1622 | ||
1523 | stats.key_bytes *= sizeof(uint64_t); | 1623 | stats.key_bytes *= sizeof(uint64_t); |
1524 | stats.dirty <<= 9; | ||
1525 | stats.data <<= 9; | 1624 | stats.data <<= 9; |
1526 | stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; | 1625 | stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; |
1527 | memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); | 1626 | memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); |
1528 | 1627 | ||
1529 | trace_bcache_gc_end(c); | 1628 | trace_bcache_gc_end(c); |
1530 | 1629 | ||
1531 | continue_at(cl, bch_moving_gc, bch_gc_wq); | 1630 | bch_moving_gc(c); |
1631 | } | ||
1632 | |||
1633 | static int bch_gc_thread(void *arg) | ||
1634 | { | ||
1635 | struct cache_set *c = arg; | ||
1636 | struct cache *ca; | ||
1637 | unsigned i; | ||
1638 | |||
1639 | while (1) { | ||
1640 | again: | ||
1641 | bch_btree_gc(c); | ||
1642 | |||
1643 | set_current_state(TASK_INTERRUPTIBLE); | ||
1644 | if (kthread_should_stop()) | ||
1645 | break; | ||
1646 | |||
1647 | mutex_lock(&c->bucket_lock); | ||
1648 | |||
1649 | for_each_cache(ca, c, i) | ||
1650 | if (ca->invalidate_needs_gc) { | ||
1651 | mutex_unlock(&c->bucket_lock); | ||
1652 | set_current_state(TASK_RUNNING); | ||
1653 | goto again; | ||
1654 | } | ||
1655 | |||
1656 | mutex_unlock(&c->bucket_lock); | ||
1657 | |||
1658 | try_to_freeze(); | ||
1659 | schedule(); | ||
1660 | } | ||
1661 | |||
1662 | return 0; | ||
1532 | } | 1663 | } |
1533 | 1664 | ||
1534 | void bch_queue_gc(struct cache_set *c) | 1665 | int bch_gc_thread_start(struct cache_set *c) |
1535 | { | 1666 | { |
1536 | closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl); | 1667 | c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc"); |
1668 | if (IS_ERR(c->gc_thread)) | ||
1669 | return PTR_ERR(c->gc_thread); | ||
1670 | |||
1671 | set_task_state(c->gc_thread, TASK_INTERRUPTIBLE); | ||
1672 | return 0; | ||
1537 | } | 1673 | } |
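With bch_queue_gc() gone, garbage collection is owned by a dedicated thread: run a pass, loop straight back if any cache still needs buckets invalidated, otherwise sleep until woken. A user-space analogue of that wake/sleep protocol using pthreads in place of kthread/schedule(); every name below is made up for the sketch and nothing in it comes from the driver:

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  kick = PTHREAD_COND_INITIALIZER;
static bool gc_requested;        /* set by the wake_gc() analogue below */
static bool invalidate_needs_gc; /* left false here; a cache would set it
				    when it runs out of buckets to reclaim */
static bool stopping;            /* plays the role of kthread_should_stop() */

static void run_gc(void)
{
	printf("gc pass\n");	/* stands in for a full bch_btree_gc() pass */
}

static void *gc_thread(void *arg)
{
	(void)arg;

	pthread_mutex_lock(&lock);
	while (!stopping) {
		pthread_mutex_unlock(&lock);
		run_gc();
		pthread_mutex_lock(&lock);

		/* Go straight into another pass if the allocator is starved,
		 * mirroring the invalidate_needs_gc recheck. */
		if (invalidate_needs_gc)
			continue;

		/* Otherwise sleep until someone kicks us or asks us to stop. */
		while (!gc_requested && !stopping)
			pthread_cond_wait(&kick, &lock);
		gc_requested = false;
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

/* Stands in for however the driver wakes the thread after a write burst. */
static void wake_gc(void)
{
	pthread_mutex_lock(&lock);
	gc_requested = true;
	pthread_cond_signal(&kick);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t;

	if (pthread_create(&t, NULL, gc_thread, NULL))
		return 1;

	wake_gc();

	pthread_mutex_lock(&lock);
	stopping = true;
	pthread_cond_signal(&kick);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	return 0;
}
```

One design point worth noting from the hunk above: bch_gc_thread_start() creates the thread and marks it TASK_INTERRUPTIBLE without waking it, so the first real pass presumably waits for an explicit wake-up elsewhere; the toy main() plays that role with wake_gc().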
1538 | 1674 | ||
1539 | /* Initial partial gc */ | 1675 | /* Initial partial gc */ |
@@ -1541,9 +1677,9 @@ void bch_queue_gc(struct cache_set *c) | |||
1541 | static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, | 1677 | static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, |
1542 | unsigned long **seen) | 1678 | unsigned long **seen) |
1543 | { | 1679 | { |
1544 | int ret; | 1680 | int ret = 0; |
1545 | unsigned i; | 1681 | unsigned i; |
1546 | struct bkey *k; | 1682 | struct bkey *k, *p = NULL; |
1547 | struct bucket *g; | 1683 | struct bucket *g; |
1548 | struct btree_iter iter; | 1684 | struct btree_iter iter; |
1549 | 1685 | ||
@@ -1570,31 +1706,32 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, | |||
1570 | } | 1706 | } |
1571 | 1707 | ||
1572 | if (b->level) { | 1708 | if (b->level) { |
1573 | k = bch_next_recurse_key(b, &ZERO_KEY); | 1709 | bch_btree_iter_init(b, &iter, NULL); |
1574 | 1710 | ||
1575 | while (k) { | 1711 | do { |
1576 | struct bkey *p = bch_next_recurse_key(b, k); | 1712 | k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); |
1577 | if (p) | 1713 | if (k) |
1578 | btree_node_prefetch(b->c, p, b->level - 1); | 1714 | btree_node_prefetch(b->c, k, b->level - 1); |
1579 | 1715 | ||
1580 | ret = btree(check_recurse, k, b, op, seen); | 1716 | if (p) |
1581 | if (ret) | 1717 | ret = btree(check_recurse, p, b, op, seen); |
1582 | return ret; | ||
1583 | 1718 | ||
1584 | k = p; | 1719 | p = k; |
1585 | } | 1720 | } while (p && !ret); |
1586 | } | 1721 | } |
1587 | 1722 | ||
1588 | return 0; | 1723 | return 0; |
1589 | } | 1724 | } |
1590 | 1725 | ||
1591 | int bch_btree_check(struct cache_set *c, struct btree_op *op) | 1726 | int bch_btree_check(struct cache_set *c) |
1592 | { | 1727 | { |
1593 | int ret = -ENOMEM; | 1728 | int ret = -ENOMEM; |
1594 | unsigned i; | 1729 | unsigned i; |
1595 | unsigned long *seen[MAX_CACHES_PER_SET]; | 1730 | unsigned long *seen[MAX_CACHES_PER_SET]; |
1731 | struct btree_op op; | ||
1596 | 1732 | ||
1597 | memset(seen, 0, sizeof(seen)); | 1733 | memset(seen, 0, sizeof(seen)); |
1734 | bch_btree_op_init(&op, SHRT_MAX); | ||
1598 | 1735 | ||
1599 | for (i = 0; c->cache[i]; i++) { | 1736 | for (i = 0; c->cache[i]; i++) { |
1600 | size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); | 1737 | size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); |
@@ -1606,7 +1743,7 @@ int bch_btree_check(struct cache_set *c, struct btree_op *op) | |||
1606 | memset(seen[i], 0xFF, n); | 1743 | memset(seen[i], 0xFF, n); |
1607 | } | 1744 | } |
1608 | 1745 | ||
1609 | ret = btree_root(check_recurse, c, op, seen); | 1746 | ret = btree_root(check_recurse, c, &op, seen); |
1610 | err: | 1747 | err: |
1611 | for (i = 0; i < MAX_CACHES_PER_SET; i++) | 1748 | for (i = 0; i < MAX_CACHES_PER_SET; i++) |
1612 | kfree(seen[i]); | 1749 | kfree(seen[i]); |
@@ -1628,10 +1765,9 @@ static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) | |||
1628 | bch_bset_fix_lookup_table(b, where); | 1765 | bch_bset_fix_lookup_table(b, where); |
1629 | } | 1766 | } |
1630 | 1767 | ||
1631 | static bool fix_overlapping_extents(struct btree *b, | 1768 | static bool fix_overlapping_extents(struct btree *b, struct bkey *insert, |
1632 | struct bkey *insert, | ||
1633 | struct btree_iter *iter, | 1769 | struct btree_iter *iter, |
1634 | struct btree_op *op) | 1770 | struct bkey *replace_key) |
1635 | { | 1771 | { |
1636 | void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) | 1772 | void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) |
1637 | { | 1773 | { |
@@ -1659,39 +1795,38 @@ static bool fix_overlapping_extents(struct btree *b, | |||
1659 | * We might overlap with 0 size extents; we can't skip these | 1795 | * We might overlap with 0 size extents; we can't skip these |
1660 | * because if they're in the set we're inserting to we have to | 1796 | * because if they're in the set we're inserting to we have to |
1661 | * adjust them so they don't overlap with the key we're | 1797 | * adjust them so they don't overlap with the key we're |
1662 | * inserting. But we don't want to check them for BTREE_REPLACE | 1798 | * inserting. But we don't want to check them for replace |
1663 | * operations. | 1799 | * operations. |
1664 | */ | 1800 | */ |
1665 | 1801 | ||
1666 | if (op->type == BTREE_REPLACE && | 1802 | if (replace_key && KEY_SIZE(k)) { |
1667 | KEY_SIZE(k)) { | ||
1668 | /* | 1803 | /* |
1669 | * k might have been split since we inserted/found the | 1804 | * k might have been split since we inserted/found the |
1670 | * key we're replacing | 1805 | * key we're replacing |
1671 | */ | 1806 | */ |
1672 | unsigned i; | 1807 | unsigned i; |
1673 | uint64_t offset = KEY_START(k) - | 1808 | uint64_t offset = KEY_START(k) - |
1674 | KEY_START(&op->replace); | 1809 | KEY_START(replace_key); |
1675 | 1810 | ||
1676 | /* But it must be a subset of the replace key */ | 1811 | /* But it must be a subset of the replace key */ |
1677 | if (KEY_START(k) < KEY_START(&op->replace) || | 1812 | if (KEY_START(k) < KEY_START(replace_key) || |
1678 | KEY_OFFSET(k) > KEY_OFFSET(&op->replace)) | 1813 | KEY_OFFSET(k) > KEY_OFFSET(replace_key)) |
1679 | goto check_failed; | 1814 | goto check_failed; |
1680 | 1815 | ||
1681 | /* We didn't find a key that we were supposed to */ | 1816 | /* We didn't find a key that we were supposed to */ |
1682 | if (KEY_START(k) > KEY_START(insert) + sectors_found) | 1817 | if (KEY_START(k) > KEY_START(insert) + sectors_found) |
1683 | goto check_failed; | 1818 | goto check_failed; |
1684 | 1819 | ||
1685 | if (KEY_PTRS(&op->replace) != KEY_PTRS(k)) | 1820 | if (KEY_PTRS(replace_key) != KEY_PTRS(k)) |
1686 | goto check_failed; | 1821 | goto check_failed; |
1687 | 1822 | ||
1688 | /* skip past gen */ | 1823 | /* skip past gen */ |
1689 | offset <<= 8; | 1824 | offset <<= 8; |
1690 | 1825 | ||
1691 | BUG_ON(!KEY_PTRS(&op->replace)); | 1826 | BUG_ON(!KEY_PTRS(replace_key)); |
1692 | 1827 | ||
1693 | for (i = 0; i < KEY_PTRS(&op->replace); i++) | 1828 | for (i = 0; i < KEY_PTRS(replace_key); i++) |
1694 | if (k->ptr[i] != op->replace.ptr[i] + offset) | 1829 | if (k->ptr[i] != replace_key->ptr[i] + offset) |
1695 | goto check_failed; | 1830 | goto check_failed; |
1696 | 1831 | ||
1697 | sectors_found = KEY_OFFSET(k) - KEY_START(insert); | 1832 | sectors_found = KEY_OFFSET(k) - KEY_START(insert); |
@@ -1742,6 +1877,9 @@ static bool fix_overlapping_extents(struct btree *b, | |||
1742 | if (bkey_cmp(insert, k) < 0) { | 1877 | if (bkey_cmp(insert, k) < 0) { |
1743 | bch_cut_front(insert, k); | 1878 | bch_cut_front(insert, k); |
1744 | } else { | 1879 | } else { |
1880 | if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) | ||
1881 | old_offset = KEY_START(insert); | ||
1882 | |||
1745 | if (bkey_written(b, k) && | 1883 | if (bkey_written(b, k) && |
1746 | bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { | 1884 | bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { |
1747 | /* | 1885 | /* |
@@ -1759,9 +1897,8 @@ static bool fix_overlapping_extents(struct btree *b, | |||
1759 | } | 1897 | } |
1760 | 1898 | ||
1761 | check_failed: | 1899 | check_failed: |
1762 | if (op->type == BTREE_REPLACE) { | 1900 | if (replace_key) { |
1763 | if (!sectors_found) { | 1901 | if (!sectors_found) { |
1764 | op->insert_collision = true; | ||
1765 | return true; | 1902 | return true; |
1766 | } else if (sectors_found < KEY_SIZE(insert)) { | 1903 | } else if (sectors_found < KEY_SIZE(insert)) { |
1767 | SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - | 1904 | SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - |
@@ -1774,7 +1911,7 @@ check_failed: | |||
1774 | } | 1911 | } |
1775 | 1912 | ||
1776 | static bool btree_insert_key(struct btree *b, struct btree_op *op, | 1913 | static bool btree_insert_key(struct btree *b, struct btree_op *op, |
1777 | struct bkey *k) | 1914 | struct bkey *k, struct bkey *replace_key) |
1778 | { | 1915 | { |
1779 | struct bset *i = b->sets[b->nsets].data; | 1916 | struct bset *i = b->sets[b->nsets].data; |
1780 | struct bkey *m, *prev; | 1917 | struct bkey *m, *prev; |
@@ -1786,22 +1923,23 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, | |||
1786 | 1923 | ||
1787 | if (!b->level) { | 1924 | if (!b->level) { |
1788 | struct btree_iter iter; | 1925 | struct btree_iter iter; |
1789 | struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0); | ||
1790 | 1926 | ||
1791 | /* | 1927 | /* |
1792 | * bset_search() returns the first key that is strictly greater | 1928 | * bset_search() returns the first key that is strictly greater |
1793 | * than the search key - but for back merging, we want to find | 1929 | * than the search key - but for back merging, we want to find |
1794 | * the first key that is greater than or equal to KEY_START(k) - | 1930 | * the previous key. |
1795 | * unless KEY_START(k) is 0. | ||
1796 | */ | 1931 | */ |
1797 | if (KEY_OFFSET(&search)) | ||
1798 | SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1); | ||
1799 | |||
1800 | prev = NULL; | 1932 | prev = NULL; |
1801 | m = bch_btree_iter_init(b, &iter, &search); | 1933 | m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k))); |
1802 | 1934 | ||
1803 | if (fix_overlapping_extents(b, k, &iter, op)) | 1935 | if (fix_overlapping_extents(b, k, &iter, replace_key)) { |
1936 | op->insert_collision = true; | ||
1804 | return false; | 1937 | return false; |
1938 | } | ||
1939 | |||
1940 | if (KEY_DIRTY(k)) | ||
1941 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), | ||
1942 | KEY_START(k), KEY_SIZE(k)); | ||
1805 | 1943 | ||
1806 | while (m != end(i) && | 1944 | while (m != end(i) && |
1807 | bkey_cmp(k, &START_KEY(m)) > 0) | 1945 | bkey_cmp(k, &START_KEY(m)) > 0) |
@@ -1825,84 +1963,80 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, | |||
1825 | if (m != end(i) && | 1963 | if (m != end(i) && |
1826 | bch_bkey_try_merge(b, k, m)) | 1964 | bch_bkey_try_merge(b, k, m)) |
1827 | goto copy; | 1965 | goto copy; |
1828 | } else | 1966 | } else { |
1967 | BUG_ON(replace_key); | ||
1829 | m = bch_bset_search(b, &b->sets[b->nsets], k); | 1968 | m = bch_bset_search(b, &b->sets[b->nsets], k); |
1969 | } | ||
1830 | 1970 | ||
1831 | insert: shift_keys(b, m, k); | 1971 | insert: shift_keys(b, m, k); |
1832 | copy: bkey_copy(m, k); | 1972 | copy: bkey_copy(m, k); |
1833 | merged: | 1973 | merged: |
1834 | if (KEY_DIRTY(k)) | 1974 | bch_check_keys(b, "%u for %s", status, |
1835 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), | 1975 | replace_key ? "replace" : "insert"); |
1836 | KEY_START(k), KEY_SIZE(k)); | ||
1837 | |||
1838 | bch_check_keys(b, "%u for %s", status, op_type(op)); | ||
1839 | 1976 | ||
1840 | if (b->level && !KEY_OFFSET(k)) | 1977 | if (b->level && !KEY_OFFSET(k)) |
1841 | btree_current_write(b)->prio_blocked++; | 1978 | btree_current_write(b)->prio_blocked++; |
1842 | 1979 | ||
1843 | trace_bcache_btree_insert_key(b, k, op->type, status); | 1980 | trace_bcache_btree_insert_key(b, k, replace_key != NULL, status); |
1844 | 1981 | ||
1845 | return true; | 1982 | return true; |
1846 | } | 1983 | } |
1847 | 1984 | ||
1848 | static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) | 1985 | static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, |
1986 | struct keylist *insert_keys, | ||
1987 | struct bkey *replace_key) | ||
1849 | { | 1988 | { |
1850 | bool ret = false; | 1989 | bool ret = false; |
1851 | struct bkey *k; | 1990 | int oldsize = bch_count_data(b); |
1852 | unsigned oldsize = bch_count_data(b); | ||
1853 | |||
1854 | while ((k = bch_keylist_pop(&op->keys))) { | ||
1855 | bkey_put(b->c, k, b->level); | ||
1856 | ret |= btree_insert_key(b, op, k); | ||
1857 | } | ||
1858 | |||
1859 | BUG_ON(bch_count_data(b) < oldsize); | ||
1860 | return ret; | ||
1861 | } | ||
1862 | 1991 | ||
1863 | bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, | 1992 | while (!bch_keylist_empty(insert_keys)) { |
1864 | struct bio *bio) | 1993 | struct bset *i = write_block(b); |
1865 | { | 1994 | struct bkey *k = insert_keys->keys; |
1866 | bool ret = false; | ||
1867 | uint64_t btree_ptr = b->key.ptr[0]; | ||
1868 | unsigned long seq = b->seq; | ||
1869 | BKEY_PADDED(k) tmp; | ||
1870 | 1995 | ||
1871 | rw_unlock(false, b); | 1996 | if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c) |
1872 | rw_lock(true, b, b->level); | 1997 | > btree_blocks(b)) |
1998 | break; | ||
1873 | 1999 | ||
1874 | if (b->key.ptr[0] != btree_ptr || | 2000 | if (bkey_cmp(k, &b->key) <= 0) { |
1875 | b->seq != seq + 1 || | 2001 | if (!b->level) |
1876 | should_split(b)) | 2002 | bkey_put(b->c, k); |
1877 | goto out; | ||
1878 | 2003 | ||
1879 | op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio)); | 2004 | ret |= btree_insert_key(b, op, k, replace_key); |
2005 | bch_keylist_pop_front(insert_keys); | ||
2006 | } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { | ||
2007 | BKEY_PADDED(key) temp; | ||
2008 | bkey_copy(&temp.key, insert_keys->keys); | ||
1880 | 2009 | ||
1881 | SET_KEY_PTRS(&op->replace, 1); | 2010 | bch_cut_back(&b->key, &temp.key); |
1882 | get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); | 2011 | bch_cut_front(&b->key, insert_keys->keys); |
1883 | 2012 | ||
1884 | SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV); | 2013 | ret |= btree_insert_key(b, op, &temp.key, replace_key); |
2014 | break; | ||
2015 | } else { | ||
2016 | break; | ||
2017 | } | ||
2018 | } | ||
1885 | 2019 | ||
1886 | bkey_copy(&tmp.k, &op->replace); | 2020 | BUG_ON(!bch_keylist_empty(insert_keys) && b->level); |
1887 | 2021 | ||
1888 | BUG_ON(op->type != BTREE_INSERT); | 2022 | BUG_ON(bch_count_data(b) < oldsize); |
1889 | BUG_ON(!btree_insert_key(b, op, &tmp.k)); | ||
1890 | ret = true; | ||
1891 | out: | ||
1892 | downgrade_write(&b->lock); | ||
1893 | return ret; | 2023 | return ret; |
1894 | } | 2024 | } |
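In the new bch_btree_insert_keys(), a key that crosses the end of the current leaf is not dropped: a copy clipped to the node boundary (bch_cut_back()) is inserted here and the original is trimmed from the front (bch_cut_front()) so the remainder lands in the next leaf. The clipping itself is plain interval arithmetic; a toy version using a [start, end) extent rather than bcache's offset-plus-size key encoding:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Toy extent: [start, end) in sectors. */
struct extent {
	uint64_t start;
	uint64_t end;
};

/*
 * Split *e at 'boundary': the returned piece covers [start, boundary) and
 * belongs to the current node, *e is trimmed to [boundary, end) for the
 * next one - the roles played by bch_cut_back() and bch_cut_front().
 */
static struct extent clip_front_piece(struct extent *e, uint64_t boundary)
{
	struct extent front = { .start = e->start, .end = boundary };

	assert(e->start < boundary && boundary < e->end);
	e->start = boundary;
	return front;
}

int main(void)
{
	struct extent e = { .start = 100, .end = 300 };
	struct extent front = clip_front_piece(&e, 256);

	printf("insert [%llu,%llu) here, carry [%llu,%llu) to the next leaf\n",
	       (unsigned long long)front.start, (unsigned long long)front.end,
	       (unsigned long long)e.start, (unsigned long long)e.end);
	return 0;
}
```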
1895 | 2025 | ||
1896 | static int btree_split(struct btree *b, struct btree_op *op) | 2026 | static int btree_split(struct btree *b, struct btree_op *op, |
2027 | struct keylist *insert_keys, | ||
2028 | struct bkey *replace_key) | ||
1897 | { | 2029 | { |
1898 | bool split, root = b == b->c->root; | 2030 | bool split; |
1899 | struct btree *n1, *n2 = NULL, *n3 = NULL; | 2031 | struct btree *n1, *n2 = NULL, *n3 = NULL; |
1900 | uint64_t start_time = local_clock(); | 2032 | uint64_t start_time = local_clock(); |
2033 | struct closure cl; | ||
2034 | struct keylist parent_keys; | ||
1901 | 2035 | ||
1902 | if (b->level) | 2036 | closure_init_stack(&cl); |
1903 | set_closure_blocking(&op->cl); | 2037 | bch_keylist_init(&parent_keys); |
1904 | 2038 | ||
1905 | n1 = btree_node_alloc_replacement(b, &op->cl); | 2039 | n1 = btree_node_alloc_replacement(b, true); |
1906 | if (IS_ERR(n1)) | 2040 | if (IS_ERR(n1)) |
1907 | goto err; | 2041 | goto err; |
1908 | 2042 | ||
@@ -1913,19 +2047,20 @@ static int btree_split(struct btree *b, struct btree_op *op) | |||
1913 | 2047 | ||
1914 | trace_bcache_btree_node_split(b, n1->sets[0].data->keys); | 2048 | trace_bcache_btree_node_split(b, n1->sets[0].data->keys); |
1915 | 2049 | ||
1916 | n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); | 2050 | n2 = bch_btree_node_alloc(b->c, b->level, true); |
1917 | if (IS_ERR(n2)) | 2051 | if (IS_ERR(n2)) |
1918 | goto err_free1; | 2052 | goto err_free1; |
1919 | 2053 | ||
1920 | if (root) { | 2054 | if (!b->parent) { |
1921 | n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl); | 2055 | n3 = bch_btree_node_alloc(b->c, b->level + 1, true); |
1922 | if (IS_ERR(n3)) | 2056 | if (IS_ERR(n3)) |
1923 | goto err_free2; | 2057 | goto err_free2; |
1924 | } | 2058 | } |
1925 | 2059 | ||
1926 | bch_btree_insert_keys(n1, op); | 2060 | bch_btree_insert_keys(n1, op, insert_keys, replace_key); |
1927 | 2061 | ||
1928 | /* Has to be a linear search because we don't have an auxiliary | 2062 | /* |
2063 | * Has to be a linear search because we don't have an auxiliary | ||
1929 | * search tree yet | 2064 | * search tree yet |
1930 | */ | 2065 | */ |
1931 | 2066 | ||
@@ -1944,60 +2079,57 @@ static int btree_split(struct btree *b, struct btree_op *op) | |||
1944 | 2079 | ||
1945 | bkey_copy_key(&n2->key, &b->key); | 2080 | bkey_copy_key(&n2->key, &b->key); |
1946 | 2081 | ||
1947 | bch_keylist_add(&op->keys, &n2->key); | 2082 | bch_keylist_add(&parent_keys, &n2->key); |
1948 | bch_btree_node_write(n2, &op->cl); | 2083 | bch_btree_node_write(n2, &cl); |
1949 | rw_unlock(true, n2); | 2084 | rw_unlock(true, n2); |
1950 | } else { | 2085 | } else { |
1951 | trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); | 2086 | trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); |
1952 | 2087 | ||
1953 | bch_btree_insert_keys(n1, op); | 2088 | bch_btree_insert_keys(n1, op, insert_keys, replace_key); |
1954 | } | 2089 | } |
1955 | 2090 | ||
1956 | bch_keylist_add(&op->keys, &n1->key); | 2091 | bch_keylist_add(&parent_keys, &n1->key); |
1957 | bch_btree_node_write(n1, &op->cl); | 2092 | bch_btree_node_write(n1, &cl); |
1958 | 2093 | ||
1959 | if (n3) { | 2094 | if (n3) { |
2095 | /* Depth increases, make a new root */ | ||
1960 | bkey_copy_key(&n3->key, &MAX_KEY); | 2096 | bkey_copy_key(&n3->key, &MAX_KEY); |
1961 | bch_btree_insert_keys(n3, op); | 2097 | bch_btree_insert_keys(n3, op, &parent_keys, NULL); |
1962 | bch_btree_node_write(n3, &op->cl); | 2098 | bch_btree_node_write(n3, &cl); |
1963 | 2099 | ||
1964 | closure_sync(&op->cl); | 2100 | closure_sync(&cl); |
1965 | bch_btree_set_root(n3); | 2101 | bch_btree_set_root(n3); |
1966 | rw_unlock(true, n3); | 2102 | rw_unlock(true, n3); |
1967 | } else if (root) { | ||
1968 | op->keys.top = op->keys.bottom; | ||
1969 | closure_sync(&op->cl); | ||
1970 | bch_btree_set_root(n1); | ||
1971 | } else { | ||
1972 | unsigned i; | ||
1973 | 2103 | ||
1974 | bkey_copy(op->keys.top, &b->key); | 2104 | btree_node_free(b); |
1975 | bkey_copy_key(op->keys.top, &ZERO_KEY); | 2105 | } else if (!b->parent) { |
2106 | /* Root filled up but didn't need to be split */ | ||
2107 | closure_sync(&cl); | ||
2108 | bch_btree_set_root(n1); | ||
1976 | 2109 | ||
1977 | for (i = 0; i < KEY_PTRS(&b->key); i++) { | 2110 | btree_node_free(b); |
1978 | uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1; | 2111 | } else { |
2112 | /* Split a non root node */ | ||
2113 | closure_sync(&cl); | ||
2114 | make_btree_freeing_key(b, parent_keys.top); | ||
2115 | bch_keylist_push(&parent_keys); | ||
1979 | 2116 | ||
1980 | SET_PTR_GEN(op->keys.top, i, g); | 2117 | btree_node_free(b); |
1981 | } | ||
1982 | 2118 | ||
1983 | bch_keylist_push(&op->keys); | 2119 | bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL); |
1984 | closure_sync(&op->cl); | 2120 | BUG_ON(!bch_keylist_empty(&parent_keys)); |
1985 | atomic_inc(&b->c->prio_blocked); | ||
1986 | } | 2121 | } |
1987 | 2122 | ||
1988 | rw_unlock(true, n1); | 2123 | rw_unlock(true, n1); |
1989 | btree_node_free(b, op); | ||
1990 | 2124 | ||
1991 | bch_time_stats_update(&b->c->btree_split_time, start_time); | 2125 | bch_time_stats_update(&b->c->btree_split_time, start_time); |
1992 | 2126 | ||
1993 | return 0; | 2127 | return 0; |
1994 | err_free2: | 2128 | err_free2: |
1995 | __bkey_put(n2->c, &n2->key); | 2129 | btree_node_free(n2); |
1996 | btree_node_free(n2, op); | ||
1997 | rw_unlock(true, n2); | 2130 | rw_unlock(true, n2); |
1998 | err_free1: | 2131 | err_free1: |
1999 | __bkey_put(n1->c, &n1->key); | 2132 | btree_node_free(n1); |
2000 | btree_node_free(n1, op); | ||
2001 | rw_unlock(true, n1); | 2133 | rw_unlock(true, n1); |
2002 | err: | 2134 | err: |
2003 | if (n3 == ERR_PTR(-EAGAIN) || | 2135 | if (n3 == ERR_PTR(-EAGAIN) || |
@@ -2009,116 +2141,126 @@ err: | |||
2009 | return -ENOMEM; | 2141 | return -ENOMEM; |
2010 | } | 2142 | } |
2011 | 2143 | ||
2012 | static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, | 2144 | static int bch_btree_insert_node(struct btree *b, struct btree_op *op, |
2013 | struct keylist *stack_keys) | 2145 | struct keylist *insert_keys, |
2146 | atomic_t *journal_ref, | ||
2147 | struct bkey *replace_key) | ||
2014 | { | 2148 | { |
2015 | if (b->level) { | 2149 | BUG_ON(b->level && replace_key); |
2016 | int ret; | ||
2017 | struct bkey *insert = op->keys.bottom; | ||
2018 | struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert)); | ||
2019 | |||
2020 | if (!k) { | ||
2021 | btree_bug(b, "no key to recurse on at level %i/%i", | ||
2022 | b->level, b->c->root->level); | ||
2023 | 2150 | ||
2024 | op->keys.top = op->keys.bottom; | 2151 | if (should_split(b)) { |
2025 | return -EIO; | 2152 | if (current->bio_list) { |
2153 | op->lock = b->c->root->level + 1; | ||
2154 | return -EAGAIN; | ||
2155 | } else if (op->lock <= b->c->root->level) { | ||
2156 | op->lock = b->c->root->level + 1; | ||
2157 | return -EINTR; | ||
2158 | } else { | ||
2159 | /* Invalidated all iterators */ | ||
2160 | return btree_split(b, op, insert_keys, replace_key) ?: | ||
2161 | -EINTR; | ||
2026 | } | 2162 | } |
2163 | } else { | ||
2164 | BUG_ON(write_block(b) != b->sets[b->nsets].data); | ||
2027 | 2165 | ||
2028 | if (bkey_cmp(insert, k) > 0) { | 2166 | if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { |
2029 | unsigned i; | 2167 | if (!b->level) |
2030 | 2168 | bch_btree_leaf_dirty(b, journal_ref); | |
2031 | if (op->type == BTREE_REPLACE) { | 2169 | else |
2032 | __bkey_put(b->c, insert); | 2170 | bch_btree_node_write_sync(b); |
2033 | op->keys.top = op->keys.bottom; | 2171 | } |
2034 | op->insert_collision = true; | ||
2035 | return 0; | ||
2036 | } | ||
2037 | 2172 | ||
2038 | for (i = 0; i < KEY_PTRS(insert); i++) | 2173 | return 0; |
2039 | atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin); | 2174 | } |
2175 | } | ||
2040 | 2176 | ||
2041 | bkey_copy(stack_keys->top, insert); | 2177 | int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, |
2178 | struct bkey *check_key) | ||
2179 | { | ||
2180 | int ret = -EINTR; | ||
2181 | uint64_t btree_ptr = b->key.ptr[0]; | ||
2182 | unsigned long seq = b->seq; | ||
2183 | struct keylist insert; | ||
2184 | bool upgrade = op->lock == -1; | ||
2042 | 2185 | ||
2043 | bch_cut_back(k, insert); | 2186 | bch_keylist_init(&insert); |
2044 | bch_cut_front(k, stack_keys->top); | ||
2045 | 2187 | ||
2046 | bch_keylist_push(stack_keys); | 2188 | if (upgrade) { |
2047 | } | 2189 | rw_unlock(false, b); |
2190 | rw_lock(true, b, b->level); | ||
2048 | 2191 | ||
2049 | ret = btree(insert_recurse, k, b, op, stack_keys); | 2192 | if (b->key.ptr[0] != btree_ptr || |
2050 | if (ret) | 2193 | b->seq != seq + 1) |
2051 | return ret; | 2194 | goto out; |
2052 | } | 2195 | } |
2053 | 2196 | ||
2054 | if (!bch_keylist_empty(&op->keys)) { | 2197 | SET_KEY_PTRS(check_key, 1); |
2055 | if (should_split(b)) { | 2198 | get_random_bytes(&check_key->ptr[0], sizeof(uint64_t)); |
2056 | if (op->lock <= b->c->root->level) { | ||
2057 | BUG_ON(b->level); | ||
2058 | op->lock = b->c->root->level + 1; | ||
2059 | return -EINTR; | ||
2060 | } | ||
2061 | return btree_split(b, op); | ||
2062 | } | ||
2063 | 2199 | ||
2064 | BUG_ON(write_block(b) != b->sets[b->nsets].data); | 2200 | SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV); |
2065 | 2201 | ||
2066 | if (bch_btree_insert_keys(b, op)) { | 2202 | bch_keylist_add(&insert, check_key); |
2067 | if (!b->level) | ||
2068 | bch_btree_leaf_dirty(b, op); | ||
2069 | else | ||
2070 | bch_btree_node_write(b, &op->cl); | ||
2071 | } | ||
2072 | } | ||
2073 | 2203 | ||
2074 | return 0; | 2204 | ret = bch_btree_insert_node(b, op, &insert, NULL, NULL); |
2205 | |||
2206 | BUG_ON(!ret && !bch_keylist_empty(&insert)); | ||
2207 | out: | ||
2208 | if (upgrade) | ||
2209 | downgrade_write(&b->lock); | ||
2210 | return ret; | ||
2075 | } | 2211 | } |
2076 | 2212 | ||
2077 | int bch_btree_insert(struct btree_op *op, struct cache_set *c) | 2213 | struct btree_insert_op { |
2214 | struct btree_op op; | ||
2215 | struct keylist *keys; | ||
2216 | atomic_t *journal_ref; | ||
2217 | struct bkey *replace_key; | ||
2218 | }; | ||
2219 | |||
2220 | int btree_insert_fn(struct btree_op *b_op, struct btree *b) | ||
2078 | { | 2221 | { |
2079 | int ret = 0; | 2222 | struct btree_insert_op *op = container_of(b_op, |
2080 | struct keylist stack_keys; | 2223 | struct btree_insert_op, op); |
2081 | 2224 | ||
2082 | /* | 2225 | int ret = bch_btree_insert_node(b, &op->op, op->keys, |
2083 | * Don't want to block with the btree locked unless we have to, | 2226 | op->journal_ref, op->replace_key); |
2084 | * otherwise we get deadlocks with try_harder and between split/gc | 2227 | if (ret && !bch_keylist_empty(op->keys)) |
2085 | */ | 2228 | return ret; |
2086 | clear_closure_blocking(&op->cl); | 2229 | else |
2087 | 2230 | return MAP_DONE; | |
2088 | BUG_ON(bch_keylist_empty(&op->keys)); | 2231 | } |
2089 | bch_keylist_copy(&stack_keys, &op->keys); | ||
2090 | bch_keylist_init(&op->keys); | ||
2091 | |||
2092 | while (!bch_keylist_empty(&stack_keys) || | ||
2093 | !bch_keylist_empty(&op->keys)) { | ||
2094 | if (bch_keylist_empty(&op->keys)) { | ||
2095 | bch_keylist_add(&op->keys, | ||
2096 | bch_keylist_pop(&stack_keys)); | ||
2097 | op->lock = 0; | ||
2098 | } | ||
2099 | 2232 | ||
2100 | ret = btree_root(insert_recurse, c, op, &stack_keys); | 2233 | int bch_btree_insert(struct cache_set *c, struct keylist *keys, |
2234 | atomic_t *journal_ref, struct bkey *replace_key) | ||
2235 | { | ||
2236 | struct btree_insert_op op; | ||
2237 | int ret = 0; | ||
2101 | 2238 | ||
2102 | if (ret == -EAGAIN) { | 2239 | BUG_ON(current->bio_list); |
2103 | ret = 0; | 2240 | BUG_ON(bch_keylist_empty(keys)); |
2104 | closure_sync(&op->cl); | 2241 | |
2105 | } else if (ret) { | 2242 | bch_btree_op_init(&op.op, 0); |
2106 | struct bkey *k; | 2243 | op.keys = keys; |
2244 | op.journal_ref = journal_ref; | ||
2245 | op.replace_key = replace_key; | ||
2246 | |||
2247 | while (!ret && !bch_keylist_empty(keys)) { | ||
2248 | op.op.lock = 0; | ||
2249 | ret = bch_btree_map_leaf_nodes(&op.op, c, | ||
2250 | &START_KEY(keys->keys), | ||
2251 | btree_insert_fn); | ||
2252 | } | ||
2107 | 2253 | ||
2108 | pr_err("error %i trying to insert key for %s", | 2254 | if (ret) { |
2109 | ret, op_type(op)); | 2255 | struct bkey *k; |
2110 | 2256 | ||
2111 | while ((k = bch_keylist_pop(&stack_keys) ?: | 2257 | pr_err("error %i", ret); |
2112 | bch_keylist_pop(&op->keys))) | ||
2113 | bkey_put(c, k, 0); | ||
2114 | } | ||
2115 | } | ||
2116 | 2258 | ||
2117 | bch_keylist_free(&stack_keys); | 2259 | while ((k = bch_keylist_pop(keys))) |
2260 | bkey_put(c, k); | ||
2261 | } else if (op.op.insert_collision) | ||
2262 | ret = -ESRCH; | ||
2118 | 2263 | ||
2119 | if (op->journal) | ||
2120 | atomic_dec_bug(op->journal); | ||
2121 | op->journal = NULL; | ||
2122 | return ret; | 2264 | return ret; |
2123 | } | 2265 | } |
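bch_btree_insert() now rides the generic leaf-mapping machinery instead of its own recursion: the extra state it needs (keylist, journal ref, replace key) is wrapped around a plain btree_op, and btree_insert_fn() climbs back from the base pointer with container_of(). The same embedding idiom in miniature, outside the kernel; container_of here is just the usual offsetof() definition:

```c
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Base "operation" type the traversal code knows about. */
struct base_op {
	int lock_level;
};

/*
 * Caller-specific wrapper carrying extra state, analogous to
 * struct btree_insert_op wrapping struct btree_op.
 */
struct insert_op {
	struct base_op op;   /* must be embedded, not pointed to */
	const char *payload; /* stands in for keys/journal_ref/replace_key */
};

/* The callback only receives the base pointer, as btree_insert_fn() does... */
static int insert_fn(struct base_op *b_op)
{
	/* ...and climbs back to the wrapper to reach its own state. */
	struct insert_op *op = container_of(b_op, struct insert_op, op);

	printf("inserting: %s (lock %d)\n", op->payload, op->op.lock_level);
	return 0;
}

int main(void)
{
	struct insert_op op = { .op = { .lock_level = 0 }, .payload = "demo" };

	return insert_fn(&op.op);
}
```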
2124 | 2266 | ||
@@ -2141,132 +2283,81 @@ void bch_btree_set_root(struct btree *b) | |||
2141 | mutex_unlock(&b->c->bucket_lock); | 2283 | mutex_unlock(&b->c->bucket_lock); |
2142 | 2284 | ||
2143 | b->c->root = b; | 2285 | b->c->root = b; |
2144 | __bkey_put(b->c, &b->key); | ||
2145 | 2286 | ||
2146 | bch_journal_meta(b->c, &cl); | 2287 | bch_journal_meta(b->c, &cl); |
2147 | closure_sync(&cl); | 2288 | closure_sync(&cl); |
2148 | } | 2289 | } |
2149 | 2290 | ||
2150 | /* Cache lookup */ | 2291 | /* Map across nodes or keys */ |
2151 | 2292 | ||
2152 | static int submit_partial_cache_miss(struct btree *b, struct btree_op *op, | 2293 | static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, |
2153 | struct bkey *k) | 2294 | struct bkey *from, |
2295 | btree_map_nodes_fn *fn, int flags) | ||
2154 | { | 2296 | { |
2155 | struct search *s = container_of(op, struct search, op); | 2297 | int ret = MAP_CONTINUE; |
2156 | struct bio *bio = &s->bio.bio; | 2298 | |
2157 | int ret = 0; | 2299 | if (b->level) { |
2300 | struct bkey *k; | ||
2301 | struct btree_iter iter; | ||
2158 | 2302 | ||
2159 | while (!ret && | 2303 | bch_btree_iter_init(b, &iter, from); |
2160 | !op->lookup_done) { | ||
2161 | unsigned sectors = INT_MAX; | ||
2162 | 2304 | ||
2163 | if (KEY_INODE(k) == op->inode) { | 2305 | while ((k = bch_btree_iter_next_filter(&iter, b, |
2164 | if (KEY_START(k) <= bio->bi_sector) | 2306 | bch_ptr_bad))) { |
2165 | break; | 2307 | ret = btree(map_nodes_recurse, k, b, |
2308 | op, from, fn, flags); | ||
2309 | from = NULL; | ||
2166 | 2310 | ||
2167 | sectors = min_t(uint64_t, sectors, | 2311 | if (ret != MAP_CONTINUE) |
2168 | KEY_START(k) - bio->bi_sector); | 2312 | return ret; |
2169 | } | 2313 | } |
2170 | |||
2171 | ret = s->d->cache_miss(b, s, bio, sectors); | ||
2172 | } | 2314 | } |
2173 | 2315 | ||
2316 | if (!b->level || flags == MAP_ALL_NODES) | ||
2317 | ret = fn(op, b); | ||
2318 | |||
2174 | return ret; | 2319 | return ret; |
2175 | } | 2320 | } |
2176 | 2321 | ||
2177 | /* | 2322 | int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, |
2178 | * Read from a single key, handling the initial cache miss if the key starts in | 2323 | struct bkey *from, btree_map_nodes_fn *fn, int flags) |
2179 | * the middle of the bio | ||
2180 | */ | ||
2181 | static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, | ||
2182 | struct bkey *k) | ||
2183 | { | 2324 | { |
2184 | struct search *s = container_of(op, struct search, op); | 2325 | return btree_root(map_nodes_recurse, c, op, from, fn, flags); |
2185 | struct bio *bio = &s->bio.bio; | ||
2186 | unsigned ptr; | ||
2187 | struct bio *n; | ||
2188 | |||
2189 | int ret = submit_partial_cache_miss(b, op, k); | ||
2190 | if (ret || op->lookup_done) | ||
2191 | return ret; | ||
2192 | |||
2193 | /* XXX: figure out best pointer - for multiple cache devices */ | ||
2194 | ptr = 0; | ||
2195 | |||
2196 | PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; | ||
2197 | |||
2198 | while (!op->lookup_done && | ||
2199 | KEY_INODE(k) == op->inode && | ||
2200 | bio->bi_sector < KEY_OFFSET(k)) { | ||
2201 | struct bkey *bio_key; | ||
2202 | sector_t sector = PTR_OFFSET(k, ptr) + | ||
2203 | (bio->bi_sector - KEY_START(k)); | ||
2204 | unsigned sectors = min_t(uint64_t, INT_MAX, | ||
2205 | KEY_OFFSET(k) - bio->bi_sector); | ||
2206 | |||
2207 | n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); | ||
2208 | if (n == bio) | ||
2209 | op->lookup_done = true; | ||
2210 | |||
2211 | bio_key = &container_of(n, struct bbio, bio)->key; | ||
2212 | |||
2213 | /* | ||
2214 | * The bucket we're reading from might be reused while our bio | ||
2215 | * is in flight, and we could then end up reading the wrong | ||
2216 | * data. | ||
2217 | * | ||
2218 | * We guard against this by checking (in cache_read_endio()) if | ||
2219 | * the pointer is stale again; if so, we treat it as an error | ||
2220 | * and reread from the backing device (but we don't pass that | ||
2221 | * error up anywhere). | ||
2222 | */ | ||
2223 | |||
2224 | bch_bkey_copy_single_ptr(bio_key, k, ptr); | ||
2225 | SET_PTR_OFFSET(bio_key, 0, sector); | ||
2226 | |||
2227 | n->bi_end_io = bch_cache_read_endio; | ||
2228 | n->bi_private = &s->cl; | ||
2229 | |||
2230 | __bch_submit_bbio(n, b->c); | ||
2231 | } | ||
2232 | |||
2233 | return 0; | ||
2234 | } | 2326 | } |
2235 | 2327 | ||
2236 | int bch_btree_search_recurse(struct btree *b, struct btree_op *op) | 2328 | static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, |
2329 | struct bkey *from, btree_map_keys_fn *fn, | ||
2330 | int flags) | ||
2237 | { | 2331 | { |
2238 | struct search *s = container_of(op, struct search, op); | 2332 | int ret = MAP_CONTINUE; |
2239 | struct bio *bio = &s->bio.bio; | ||
2240 | |||
2241 | int ret = 0; | ||
2242 | struct bkey *k; | 2333 | struct bkey *k; |
2243 | struct btree_iter iter; | 2334 | struct btree_iter iter; |
2244 | bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); | ||
2245 | 2335 | ||
2246 | do { | 2336 | bch_btree_iter_init(b, &iter, from); |
2247 | k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); | ||
2248 | if (!k) { | ||
2249 | /* | ||
2250 | * b->key would be exactly what we want, except that | ||
2251 | * pointers to btree nodes have nonzero size - we | ||
2252 | * wouldn't go far enough | ||
2253 | */ | ||
2254 | 2337 | ||
2255 | ret = submit_partial_cache_miss(b, op, | 2338 | while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) { |
2256 | &KEY(KEY_INODE(&b->key), | 2339 | ret = !b->level |
2257 | KEY_OFFSET(&b->key), 0)); | 2340 | ? fn(op, b, k) |
2258 | break; | 2341 | : btree(map_keys_recurse, k, b, op, from, fn, flags); |
2259 | } | 2342 | from = NULL; |
2343 | |||
2344 | if (ret != MAP_CONTINUE) | ||
2345 | return ret; | ||
2346 | } | ||
2260 | 2347 | ||
2261 | ret = b->level | 2348 | if (!b->level && (flags & MAP_END_KEY)) |
2262 | ? btree(search_recurse, k, b, op) | 2349 | ret = fn(op, b, &KEY(KEY_INODE(&b->key), |
2263 | : submit_partial_cache_hit(b, op, k); | 2350 | KEY_OFFSET(&b->key), 0)); |
2264 | } while (!ret && | ||
2265 | !op->lookup_done); | ||
2266 | 2351 | ||
2267 | return ret; | 2352 | return ret; |
2268 | } | 2353 | } |
2269 | 2354 | ||
2355 | int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, | ||
2356 | struct bkey *from, btree_map_keys_fn *fn, int flags) | ||
2357 | { | ||
2358 | return btree_root(map_keys_recurse, c, op, from, fn, flags); | ||
2359 | } | ||
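These two map helpers invert the old pattern: rather than every caller writing its own btree recursion, callers pass a callback that returns a continue/done code (btree_insert_fn() above and refill_keybuf_fn() below both follow this contract). A flat, self-contained model of the same contract over a sorted array instead of a btree, with return codes that mirror MAP_CONTINUE/MAP_DONE in meaning only:

```c
#include <stdio.h>

/* Return codes with the same meaning as bcache's MAP_DONE/MAP_CONTINUE. */
enum { WALK_DONE = 0, WALK_CONTINUE = 1 };

typedef int (*walk_fn)(void *state, int key);

/*
 * Visit keys in order starting at 'from', stopping early if the callback
 * asks to - the shape of bch_btree_map_keys() without the recursion.
 */
static int walk_keys(const int *keys, unsigned n, int from,
		     void *state, walk_fn fn)
{
	int ret = WALK_CONTINUE;

	for (unsigned i = 0; i < n && ret == WALK_CONTINUE; i++)
		if (keys[i] >= from)
			ret = fn(state, keys[i]);

	return ret;
}

/* Example callback: visit keys until a limit is reached. */
static int collect_until(void *state, int key)
{
	int *limit = state;

	if (key >= *limit)
		return WALK_DONE;

	printf("visit %d\n", key);
	return WALK_CONTINUE;
}

int main(void)
{
	const int keys[] = { 2, 5, 8, 13, 21, 34 };
	int limit = 20;

	walk_keys(keys, 6, 5, &limit, collect_until);  /* visits 5, 8, 13 */
	return 0;
}
```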
2360 | |||
2270 | /* Keybuf code */ | 2361 | /* Keybuf code */ |
2271 | 2362 | ||
2272 | static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) | 2363 | static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) |
@@ -2285,80 +2376,79 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, | |||
2285 | return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); | 2376 | return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); |
2286 | } | 2377 | } |
2287 | 2378 | ||
2288 | static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, | 2379 | struct refill { |
2289 | struct keybuf *buf, struct bkey *end, | 2380 | struct btree_op op; |
2290 | keybuf_pred_fn *pred) | 2381 | unsigned nr_found; |
2291 | { | 2382 | struct keybuf *buf; |
2292 | struct btree_iter iter; | 2383 | struct bkey *end; |
2293 | bch_btree_iter_init(b, &iter, &buf->last_scanned); | 2384 | keybuf_pred_fn *pred; |
2294 | 2385 | }; | |
2295 | while (!array_freelist_empty(&buf->freelist)) { | ||
2296 | struct bkey *k = bch_btree_iter_next_filter(&iter, b, | ||
2297 | bch_ptr_bad); | ||
2298 | |||
2299 | if (!b->level) { | ||
2300 | if (!k) { | ||
2301 | buf->last_scanned = b->key; | ||
2302 | break; | ||
2303 | } | ||
2304 | 2386 | ||
2305 | buf->last_scanned = *k; | 2387 | static int refill_keybuf_fn(struct btree_op *op, struct btree *b, |
2306 | if (bkey_cmp(&buf->last_scanned, end) >= 0) | 2388 | struct bkey *k) |
2307 | break; | 2389 | { |
2390 | struct refill *refill = container_of(op, struct refill, op); | ||
2391 | struct keybuf *buf = refill->buf; | ||
2392 | int ret = MAP_CONTINUE; | ||
2308 | 2393 | ||
2309 | if (pred(buf, k)) { | 2394 | if (bkey_cmp(k, refill->end) >= 0) { |
2310 | struct keybuf_key *w; | 2395 | ret = MAP_DONE; |
2396 | goto out; | ||
2397 | } | ||
2311 | 2398 | ||
2312 | spin_lock(&buf->lock); | 2399 | if (!KEY_SIZE(k)) /* end key */ |
2400 | goto out; | ||
2313 | 2401 | ||
2314 | w = array_alloc(&buf->freelist); | 2402 | if (refill->pred(buf, k)) { |
2403 | struct keybuf_key *w; | ||
2315 | 2404 | ||
2316 | w->private = NULL; | 2405 | spin_lock(&buf->lock); |
2317 | bkey_copy(&w->key, k); | ||
2318 | 2406 | ||
2319 | if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) | 2407 | w = array_alloc(&buf->freelist); |
2320 | array_free(&buf->freelist, w); | 2408 | if (!w) { |
2409 | spin_unlock(&buf->lock); | ||
2410 | return MAP_DONE; | ||
2411 | } | ||
2321 | 2412 | ||
2322 | spin_unlock(&buf->lock); | 2413 | w->private = NULL; |
2323 | } | 2414 | bkey_copy(&w->key, k); |
2324 | } else { | ||
2325 | if (!k) | ||
2326 | break; | ||
2327 | 2415 | ||
2328 | btree(refill_keybuf, k, b, op, buf, end, pred); | 2416 | if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) |
2329 | /* | 2417 | array_free(&buf->freelist, w); |
2330 | * Might get an error here, but can't really do anything | 2418 | else |
2331 | * and it'll get logged elsewhere. Just read what we | 2419 | refill->nr_found++; |
2332 | * can. | ||
2333 | */ | ||
2334 | 2420 | ||
2335 | if (bkey_cmp(&buf->last_scanned, end) >= 0) | 2421 | if (array_freelist_empty(&buf->freelist)) |
2336 | break; | 2422 | ret = MAP_DONE; |
2337 | 2423 | ||
2338 | cond_resched(); | 2424 | spin_unlock(&buf->lock); |
2339 | } | ||
2340 | } | 2425 | } |
2341 | 2426 | out: | |
2342 | return 0; | 2427 | buf->last_scanned = *k; |
2428 | return ret; | ||
2343 | } | 2429 | } |
2344 | 2430 | ||
2345 | void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, | 2431 | void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, |
2346 | struct bkey *end, keybuf_pred_fn *pred) | 2432 | struct bkey *end, keybuf_pred_fn *pred) |
2347 | { | 2433 | { |
2348 | struct bkey start = buf->last_scanned; | 2434 | struct bkey start = buf->last_scanned; |
2349 | struct btree_op op; | 2435 | struct refill refill; |
2350 | bch_btree_op_init_stack(&op); | ||
2351 | 2436 | ||
2352 | cond_resched(); | 2437 | cond_resched(); |
2353 | 2438 | ||
2354 | btree_root(refill_keybuf, c, &op, buf, end, pred); | 2439 | bch_btree_op_init(&refill.op, -1); |
2355 | closure_sync(&op.cl); | 2440 | refill.nr_found = 0; |
2441 | refill.buf = buf; | ||
2442 | refill.end = end; | ||
2443 | refill.pred = pred; | ||
2444 | |||
2445 | bch_btree_map_keys(&refill.op, c, &buf->last_scanned, | ||
2446 | refill_keybuf_fn, MAP_END_KEY); | ||
2356 | 2447 | ||
2357 | pr_debug("found %s keys from %llu:%llu to %llu:%llu", | 2448 | trace_bcache_keyscan(refill.nr_found, |
2358 | RB_EMPTY_ROOT(&buf->keys) ? "no" : | 2449 | KEY_INODE(&start), KEY_OFFSET(&start), |
2359 | array_freelist_empty(&buf->freelist) ? "some" : "a few", | 2450 | KEY_INODE(&buf->last_scanned), |
2360 | KEY_INODE(&start), KEY_OFFSET(&start), | 2451 | KEY_OFFSET(&buf->last_scanned)); |
2361 | KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned)); | ||
2362 | 2452 | ||
2363 | spin_lock(&buf->lock); | 2453 | spin_lock(&buf->lock); |
2364 | 2454 | ||
@@ -2436,9 +2526,9 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf) | |||
2436 | } | 2526 | } |
2437 | 2527 | ||
2438 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, | 2528 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, |
2439 | struct keybuf *buf, | 2529 | struct keybuf *buf, |
2440 | struct bkey *end, | 2530 | struct bkey *end, |
2441 | keybuf_pred_fn *pred) | 2531 | keybuf_pred_fn *pred) |
2442 | { | 2532 | { |
2443 | struct keybuf_key *ret; | 2533 | struct keybuf_key *ret; |
2444 | 2534 | ||
@@ -2471,14 +2561,12 @@ void bch_btree_exit(void) | |||
2471 | { | 2561 | { |
2472 | if (btree_io_wq) | 2562 | if (btree_io_wq) |
2473 | destroy_workqueue(btree_io_wq); | 2563 | destroy_workqueue(btree_io_wq); |
2474 | if (bch_gc_wq) | ||
2475 | destroy_workqueue(bch_gc_wq); | ||
2476 | } | 2564 | } |
2477 | 2565 | ||
2478 | int __init bch_btree_init(void) | 2566 | int __init bch_btree_init(void) |
2479 | { | 2567 | { |
2480 | if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) || | 2568 | btree_io_wq = create_singlethread_workqueue("bch_btree_io"); |
2481 | !(btree_io_wq = create_singlethread_workqueue("bch_btree_io"))) | 2569 | if (!btree_io_wq) |
2482 | return -ENOMEM; | 2570 | return -ENOMEM; |
2483 | 2571 | ||
2484 | return 0; | 2572 | return 0; |
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 3333d3723633..767e75570896 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h | |||
@@ -125,6 +125,7 @@ struct btree { | |||
125 | unsigned long seq; | 125 | unsigned long seq; |
126 | struct rw_semaphore lock; | 126 | struct rw_semaphore lock; |
127 | struct cache_set *c; | 127 | struct cache_set *c; |
128 | struct btree *parent; | ||
128 | 129 | ||
129 | unsigned long flags; | 130 | unsigned long flags; |
130 | uint16_t written; /* would be nice to kill */ | 131 | uint16_t written; /* would be nice to kill */ |
@@ -200,12 +201,7 @@ static inline bool bkey_written(struct btree *b, struct bkey *k) | |||
200 | 201 | ||
201 | static inline void set_gc_sectors(struct cache_set *c) | 202 | static inline void set_gc_sectors(struct cache_set *c) |
202 | { | 203 | { |
203 | atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8); | 204 | atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); |
204 | } | ||
205 | |||
206 | static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) | ||
207 | { | ||
208 | return __bch_ptr_invalid(b->c, b->level, k); | ||
209 | } | 205 | } |
210 | 206 | ||
211 | static inline struct bkey *bch_btree_iter_init(struct btree *b, | 207 | static inline struct bkey *bch_btree_iter_init(struct btree *b, |
@@ -215,6 +211,16 @@ static inline struct bkey *bch_btree_iter_init(struct btree *b, | |||
215 | return __bch_btree_iter_init(b, iter, search, b->sets); | 211 | return __bch_btree_iter_init(b, iter, search, b->sets); |
216 | } | 212 | } |
217 | 213 | ||
214 | static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) | ||
215 | { | ||
216 | if (b->level) | ||
217 | return bch_btree_ptr_invalid(b->c, k); | ||
218 | else | ||
219 | return bch_extent_ptr_invalid(b->c, k); | ||
220 | } | ||
221 | |||
222 | void bkey_put(struct cache_set *c, struct bkey *k); | ||
223 | |||
218 | /* Looping macros */ | 224 | /* Looping macros */ |
219 | 225 | ||
220 | #define for_each_cached_btree(b, c, iter) \ | 226 | #define for_each_cached_btree(b, c, iter) \ |
@@ -234,51 +240,17 @@ static inline struct bkey *bch_btree_iter_init(struct btree *b, | |||
234 | /* Recursing down the btree */ | 240 | /* Recursing down the btree */ |
235 | 241 | ||
236 | struct btree_op { | 242 | struct btree_op { |
237 | struct closure cl; | ||
238 | struct cache_set *c; | ||
239 | |||
240 | /* Journal entry we have a refcount on */ | ||
241 | atomic_t *journal; | ||
242 | |||
243 | /* Bio to be inserted into the cache */ | ||
244 | struct bio *cache_bio; | ||
245 | |||
246 | unsigned inode; | ||
247 | |||
248 | uint16_t write_prio; | ||
249 | |||
250 | /* Btree level at which we start taking write locks */ | 243 | /* Btree level at which we start taking write locks */ |
251 | short lock; | 244 | short lock; |
252 | 245 | ||
253 | /* Btree insertion type */ | ||
254 | enum { | ||
255 | BTREE_INSERT, | ||
256 | BTREE_REPLACE | ||
257 | } type:8; | ||
258 | |||
259 | unsigned csum:1; | ||
260 | unsigned skip:1; | ||
261 | unsigned flush_journal:1; | ||
262 | |||
263 | unsigned insert_data_done:1; | ||
264 | unsigned lookup_done:1; | ||
265 | unsigned insert_collision:1; | 246 | unsigned insert_collision:1; |
266 | |||
267 | /* Anything after this point won't get zeroed in do_bio_hook() */ | ||
268 | |||
269 | /* Keys to be inserted */ | ||
270 | struct keylist keys; | ||
271 | BKEY_PADDED(replace); | ||
272 | }; | 247 | }; |
273 | 248 | ||
274 | enum { | 249 | static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) |
275 | BTREE_INSERT_STATUS_INSERT, | 250 | { |
276 | BTREE_INSERT_STATUS_BACK_MERGE, | 251 | memset(op, 0, sizeof(struct btree_op)); |
277 | BTREE_INSERT_STATUS_OVERWROTE, | 252 | op->lock = write_lock_level; |
278 | BTREE_INSERT_STATUS_FRONT_MERGE, | 253 | } |
279 | }; | ||
280 | |||
281 | void bch_btree_op_init_stack(struct btree_op *); | ||
282 | 254 | ||
283 | static inline void rw_lock(bool w, struct btree *b, int level) | 255 | static inline void rw_lock(bool w, struct btree *b, int level) |
284 | { | 256 | { |
@@ -290,108 +262,71 @@ static inline void rw_lock(bool w, struct btree *b, int level) | |||
290 | 262 | ||
291 | static inline void rw_unlock(bool w, struct btree *b) | 263 | static inline void rw_unlock(bool w, struct btree *b) |
292 | { | 264 | { |
293 | #ifdef CONFIG_BCACHE_EDEBUG | ||
294 | unsigned i; | ||
295 | |||
296 | if (w && b->key.ptr[0]) | ||
297 | for (i = 0; i <= b->nsets; i++) | ||
298 | bch_check_key_order(b, b->sets[i].data); | ||
299 | #endif | ||
300 | |||
301 | if (w) | 265 | if (w) |
302 | b->seq++; | 266 | b->seq++; |
303 | (w ? up_write : up_read)(&b->lock); | 267 | (w ? up_write : up_read)(&b->lock); |
304 | } | 268 | } |
305 | 269 | ||
306 | #define insert_lock(s, b) ((b)->level <= (s)->lock) | 270 | void bch_btree_node_read(struct btree *); |
271 | void bch_btree_node_write(struct btree *, struct closure *); | ||
307 | 272 | ||
308 | /* | 273 | void bch_btree_set_root(struct btree *); |
309 | * These macros are for recursing down the btree - they handle the details of | 274 | struct btree *bch_btree_node_alloc(struct cache_set *, int, bool); |
310 | * locking and looking up nodes in the cache for you. They're best treated as | 275 | struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); |
311 | * mere syntax when reading code that uses them. | ||
312 | * | ||
313 | * op->lock determines whether we take a read or a write lock at a given depth. | ||
314 | * If you've got a read lock and find that you need a write lock (i.e. you're | ||
315 | * going to have to split), set op->lock and return -EINTR; btree_root() will | ||
316 | * call you again and you'll have the correct lock. | ||
317 | */ | ||
318 | 276 | ||
319 | /** | 277 | int bch_btree_insert_check_key(struct btree *, struct btree_op *, |
320 | * btree - recurse down the btree on a specified key | 278 | struct bkey *); |
321 | * @fn: function to call, which will be passed the child node | 279 | int bch_btree_insert(struct cache_set *, struct keylist *, |
322 | * @key: key to recurse on | 280 | atomic_t *, struct bkey *); |
323 | * @b: parent btree node | 281 | |
324 | * @op: pointer to struct btree_op | 282 | int bch_gc_thread_start(struct cache_set *); |
325 | */ | 283 | size_t bch_btree_gc_finish(struct cache_set *); |
326 | #define btree(fn, key, b, op, ...) \ | 284 | void bch_moving_gc(struct cache_set *); |
327 | ({ \ | 285 | int bch_btree_check(struct cache_set *); |
328 | int _r, l = (b)->level - 1; \ | 286 | uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); |
329 | bool _w = l <= (op)->lock; \ | ||
330 | struct btree *_b = bch_btree_node_get((b)->c, key, l, op); \ | ||
331 | if (!IS_ERR(_b)) { \ | ||
332 | _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ | ||
333 | rw_unlock(_w, _b); \ | ||
334 | } else \ | ||
335 | _r = PTR_ERR(_b); \ | ||
336 | _r; \ | ||
337 | }) | ||
338 | |||
339 | /** | ||
340 | * btree_root - call a function on the root of the btree | ||
341 | * @fn: function to call, which will be passed the child node | ||
342 | * @c: cache set | ||
343 | * @op: pointer to struct btree_op | ||
344 | */ | ||
345 | #define btree_root(fn, c, op, ...) \ | ||
346 | ({ \ | ||
347 | int _r = -EINTR; \ | ||
348 | do { \ | ||
349 | struct btree *_b = (c)->root; \ | ||
350 | bool _w = insert_lock(op, _b); \ | ||
351 | rw_lock(_w, _b, _b->level); \ | ||
352 | if (_b == (c)->root && \ | ||
353 | _w == insert_lock(op, _b)) \ | ||
354 | _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ | ||
355 | rw_unlock(_w, _b); \ | ||
356 | bch_cannibalize_unlock(c, &(op)->cl); \ | ||
357 | } while (_r == -EINTR); \ | ||
358 | \ | ||
359 | _r; \ | ||
360 | }) | ||
361 | 287 | ||
362 | static inline bool should_split(struct btree *b) | 288 | static inline void wake_up_gc(struct cache_set *c) |
363 | { | 289 | { |
364 | struct bset *i = write_block(b); | 290 | if (c->gc_thread) |
365 | return b->written >= btree_blocks(b) || | 291 | wake_up_process(c->gc_thread); |
366 | (i->seq == b->sets[0].data->seq && | ||
367 | b->written + __set_blocks(i, i->keys + 15, b->c) | ||
368 | > btree_blocks(b)); | ||
369 | } | 292 | } |
370 | 293 | ||
371 | void bch_btree_node_read(struct btree *); | 294 | #define MAP_DONE 0 |
372 | void bch_btree_node_write(struct btree *, struct closure *); | 295 | #define MAP_CONTINUE 1 |
373 | 296 | ||
374 | void bch_cannibalize_unlock(struct cache_set *, struct closure *); | 297 | #define MAP_ALL_NODES 0 |
375 | void bch_btree_set_root(struct btree *); | 298 | #define MAP_LEAF_NODES 1 |
376 | struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); | ||
377 | struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, | ||
378 | int, struct btree_op *); | ||
379 | 299 | ||
380 | bool bch_btree_insert_check_key(struct btree *, struct btree_op *, | 300 | #define MAP_END_KEY 1 |
381 | struct bio *); | ||
382 | int bch_btree_insert(struct btree_op *, struct cache_set *); | ||
383 | 301 | ||
384 | int bch_btree_search_recurse(struct btree *, struct btree_op *); | 302 | typedef int (btree_map_nodes_fn)(struct btree_op *, struct btree *); |
303 | int __bch_btree_map_nodes(struct btree_op *, struct cache_set *, | ||
304 | struct bkey *, btree_map_nodes_fn *, int); | ||
385 | 305 | ||
386 | void bch_queue_gc(struct cache_set *); | 306 | static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, |
387 | size_t bch_btree_gc_finish(struct cache_set *); | 307 | struct bkey *from, btree_map_nodes_fn *fn) |
388 | void bch_moving_gc(struct closure *); | 308 | { |
389 | int bch_btree_check(struct cache_set *, struct btree_op *); | 309 | return __bch_btree_map_nodes(op, c, from, fn, MAP_ALL_NODES); |
390 | uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); | 310 | } |
311 | |||
312 | static inline int bch_btree_map_leaf_nodes(struct btree_op *op, | ||
313 | struct cache_set *c, | ||
314 | struct bkey *from, | ||
315 | btree_map_nodes_fn *fn) | ||
316 | { | ||
317 | return __bch_btree_map_nodes(op, c, from, fn, MAP_LEAF_NODES); | ||
318 | } | ||
319 | |||
320 | typedef int (btree_map_keys_fn)(struct btree_op *, struct btree *, | ||
321 | struct bkey *); | ||
322 | int bch_btree_map_keys(struct btree_op *, struct cache_set *, | ||
323 | struct bkey *, btree_map_keys_fn *, int); | ||
324 | |||
325 | typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); | ||
391 | 326 | ||
392 | void bch_keybuf_init(struct keybuf *); | 327 | void bch_keybuf_init(struct keybuf *); |
393 | void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *, | 328 | void bch_refill_keybuf(struct cache_set *, struct keybuf *, |
394 | keybuf_pred_fn *); | 329 | struct bkey *, keybuf_pred_fn *); |
395 | bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, | 330 | bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, |
396 | struct bkey *); | 331 | struct bkey *); |
397 | void bch_keybuf_del(struct keybuf *, struct keybuf_key *); | 332 | void bch_keybuf_del(struct keybuf *, struct keybuf_key *); |
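The deleted btree_root() comment in this header describes the locking retry protocol that the new map functions still rely on: walk with read locks, and when a worker discovers it needs a write lock (for example to split a node) it raises op->lock and returns -EINTR so the caller retries with the heavier lock. A stand-alone sketch of that retry loop follows; the "split" worker is invented, only the convention (a node is write-locked when its level <= op->lock) comes from the code above.

```c
/*
 * Stand-alone sketch of the -EINTR retry protocol described in the removed
 * btree_root() comment.  All names and the "split" worker are invented;
 * only the convention is real: a node is write-locked when its
 * level <= op->lock, and a worker that needs heavier locking bumps
 * op->lock and returns -EINTR.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct btree_op {
	short lock;	/* btree level at which we start taking write locks */
};

/* Pretend worker: it can only "split" the leaf while holding a write lock. */
static int do_split(struct btree_op *op, bool write_locked)
{
	if (!write_locked) {
		op->lock = 0;		/* request write locks from level 0 up */
		return -EINTR;		/* ask the caller to retry */
	}
	printf("split done under write lock\n");
	return 0;
}

int main(void)
{
	struct btree_op op = { .lock = -1 };	/* start with read locks only */
	int leaf_level = 0;
	int ret;

	do {
		bool write_locked = leaf_level <= op.lock;
		ret = do_split(&op, write_locked);
	} while (ret == -EINTR);

	return ret;
}
```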
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 9aba2017f0d1..dfff2410322e 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c | |||
@@ -11,17 +11,6 @@ | |||
11 | 11 | ||
12 | #include "closure.h" | 12 | #include "closure.h" |
13 | 13 | ||
14 | void closure_queue(struct closure *cl) | ||
15 | { | ||
16 | struct workqueue_struct *wq = cl->wq; | ||
17 | if (wq) { | ||
18 | INIT_WORK(&cl->work, cl->work.func); | ||
19 | BUG_ON(!queue_work(wq, &cl->work)); | ||
20 | } else | ||
21 | cl->fn(cl); | ||
22 | } | ||
23 | EXPORT_SYMBOL_GPL(closure_queue); | ||
24 | |||
25 | #define CL_FIELD(type, field) \ | 14 | #define CL_FIELD(type, field) \ |
26 | case TYPE_ ## type: \ | 15 | case TYPE_ ## type: \ |
27 | return &container_of(cl, struct type, cl)->field | 16 | return &container_of(cl, struct type, cl)->field |
@@ -30,17 +19,6 @@ static struct closure_waitlist *closure_waitlist(struct closure *cl) | |||
30 | { | 19 | { |
31 | switch (cl->type) { | 20 | switch (cl->type) { |
32 | CL_FIELD(closure_with_waitlist, wait); | 21 | CL_FIELD(closure_with_waitlist, wait); |
33 | CL_FIELD(closure_with_waitlist_and_timer, wait); | ||
34 | default: | ||
35 | return NULL; | ||
36 | } | ||
37 | } | ||
38 | |||
39 | static struct timer_list *closure_timer(struct closure *cl) | ||
40 | { | ||
41 | switch (cl->type) { | ||
42 | CL_FIELD(closure_with_timer, timer); | ||
43 | CL_FIELD(closure_with_waitlist_and_timer, timer); | ||
44 | default: | 22 | default: |
45 | return NULL; | 23 | return NULL; |
46 | } | 24 | } |
@@ -51,7 +29,7 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) | |||
51 | int r = flags & CLOSURE_REMAINING_MASK; | 29 | int r = flags & CLOSURE_REMAINING_MASK; |
52 | 30 | ||
53 | BUG_ON(flags & CLOSURE_GUARD_MASK); | 31 | BUG_ON(flags & CLOSURE_GUARD_MASK); |
54 | BUG_ON(!r && (flags & ~(CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING))); | 32 | BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); |
55 | 33 | ||
56 | /* Must deliver precisely one wakeup */ | 34 | /* Must deliver precisely one wakeup */ |
57 | if (r == 1 && (flags & CLOSURE_SLEEPING)) | 35 | if (r == 1 && (flags & CLOSURE_SLEEPING)) |
@@ -59,7 +37,6 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) | |||
59 | 37 | ||
60 | if (!r) { | 38 | if (!r) { |
61 | if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { | 39 | if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { |
62 | /* CLOSURE_BLOCKING might be set - clear it */ | ||
63 | atomic_set(&cl->remaining, | 40 | atomic_set(&cl->remaining, |
64 | CLOSURE_REMAINING_INITIALIZER); | 41 | CLOSURE_REMAINING_INITIALIZER); |
65 | closure_queue(cl); | 42 | closure_queue(cl); |
@@ -90,13 +67,13 @@ void closure_sub(struct closure *cl, int v) | |||
90 | { | 67 | { |
91 | closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); | 68 | closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); |
92 | } | 69 | } |
93 | EXPORT_SYMBOL_GPL(closure_sub); | 70 | EXPORT_SYMBOL(closure_sub); |
94 | 71 | ||
95 | void closure_put(struct closure *cl) | 72 | void closure_put(struct closure *cl) |
96 | { | 73 | { |
97 | closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); | 74 | closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); |
98 | } | 75 | } |
99 | EXPORT_SYMBOL_GPL(closure_put); | 76 | EXPORT_SYMBOL(closure_put); |
100 | 77 | ||
101 | static void set_waiting(struct closure *cl, unsigned long f) | 78 | static void set_waiting(struct closure *cl, unsigned long f) |
102 | { | 79 | { |
@@ -133,7 +110,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list) | |||
133 | closure_sub(cl, CLOSURE_WAITING + 1); | 110 | closure_sub(cl, CLOSURE_WAITING + 1); |
134 | } | 111 | } |
135 | } | 112 | } |
136 | EXPORT_SYMBOL_GPL(__closure_wake_up); | 113 | EXPORT_SYMBOL(__closure_wake_up); |
137 | 114 | ||
138 | bool closure_wait(struct closure_waitlist *list, struct closure *cl) | 115 | bool closure_wait(struct closure_waitlist *list, struct closure *cl) |
139 | { | 116 | { |
@@ -146,7 +123,7 @@ bool closure_wait(struct closure_waitlist *list, struct closure *cl) | |||
146 | 123 | ||
147 | return true; | 124 | return true; |
148 | } | 125 | } |
149 | EXPORT_SYMBOL_GPL(closure_wait); | 126 | EXPORT_SYMBOL(closure_wait); |
150 | 127 | ||
151 | /** | 128 | /** |
152 | * closure_sync() - sleep until a closure has nothing left to wait on | 129 | * closure_sync() - sleep until a closure has nothing left to wait on |

@@ -169,7 +146,7 @@ void closure_sync(struct closure *cl) | |||
169 | 146 | ||
170 | __closure_end_sleep(cl); | 147 | __closure_end_sleep(cl); |
171 | } | 148 | } |
172 | EXPORT_SYMBOL_GPL(closure_sync); | 149 | EXPORT_SYMBOL(closure_sync); |
173 | 150 | ||
174 | /** | 151 | /** |
175 | * closure_trylock() - try to acquire the closure, without waiting | 152 | * closure_trylock() - try to acquire the closure, without waiting |
@@ -183,17 +160,17 @@ bool closure_trylock(struct closure *cl, struct closure *parent) | |||
183 | CLOSURE_REMAINING_INITIALIZER) != -1) | 160 | CLOSURE_REMAINING_INITIALIZER) != -1) |
184 | return false; | 161 | return false; |
185 | 162 | ||
186 | closure_set_ret_ip(cl); | ||
187 | |||
188 | smp_mb(); | 163 | smp_mb(); |
164 | |||
189 | cl->parent = parent; | 165 | cl->parent = parent; |
190 | if (parent) | 166 | if (parent) |
191 | closure_get(parent); | 167 | closure_get(parent); |
192 | 168 | ||
169 | closure_set_ret_ip(cl); | ||
193 | closure_debug_create(cl); | 170 | closure_debug_create(cl); |
194 | return true; | 171 | return true; |
195 | } | 172 | } |
196 | EXPORT_SYMBOL_GPL(closure_trylock); | 173 | EXPORT_SYMBOL(closure_trylock); |
197 | 174 | ||
198 | void __closure_lock(struct closure *cl, struct closure *parent, | 175 | void __closure_lock(struct closure *cl, struct closure *parent, |
199 | struct closure_waitlist *wait_list) | 176 | struct closure_waitlist *wait_list) |
@@ -205,57 +182,11 @@ void __closure_lock(struct closure *cl, struct closure *parent, | |||
205 | if (closure_trylock(cl, parent)) | 182 | if (closure_trylock(cl, parent)) |
206 | return; | 183 | return; |
207 | 184 | ||
208 | closure_wait_event_sync(wait_list, &wait, | 185 | closure_wait_event(wait_list, &wait, |
209 | atomic_read(&cl->remaining) == -1); | 186 | atomic_read(&cl->remaining) == -1); |
210 | } | 187 | } |
211 | } | 188 | } |
212 | EXPORT_SYMBOL_GPL(__closure_lock); | 189 | EXPORT_SYMBOL(__closure_lock); |
213 | |||
214 | static void closure_delay_timer_fn(unsigned long data) | ||
215 | { | ||
216 | struct closure *cl = (struct closure *) data; | ||
217 | closure_sub(cl, CLOSURE_TIMER + 1); | ||
218 | } | ||
219 | |||
220 | void do_closure_timer_init(struct closure *cl) | ||
221 | { | ||
222 | struct timer_list *timer = closure_timer(cl); | ||
223 | |||
224 | init_timer(timer); | ||
225 | timer->data = (unsigned long) cl; | ||
226 | timer->function = closure_delay_timer_fn; | ||
227 | } | ||
228 | EXPORT_SYMBOL_GPL(do_closure_timer_init); | ||
229 | |||
230 | bool __closure_delay(struct closure *cl, unsigned long delay, | ||
231 | struct timer_list *timer) | ||
232 | { | ||
233 | if (atomic_read(&cl->remaining) & CLOSURE_TIMER) | ||
234 | return false; | ||
235 | |||
236 | BUG_ON(timer_pending(timer)); | ||
237 | |||
238 | timer->expires = jiffies + delay; | ||
239 | |||
240 | atomic_add(CLOSURE_TIMER + 1, &cl->remaining); | ||
241 | add_timer(timer); | ||
242 | return true; | ||
243 | } | ||
244 | EXPORT_SYMBOL_GPL(__closure_delay); | ||
245 | |||
246 | void __closure_flush(struct closure *cl, struct timer_list *timer) | ||
247 | { | ||
248 | if (del_timer(timer)) | ||
249 | closure_sub(cl, CLOSURE_TIMER + 1); | ||
250 | } | ||
251 | EXPORT_SYMBOL_GPL(__closure_flush); | ||
252 | |||
253 | void __closure_flush_sync(struct closure *cl, struct timer_list *timer) | ||
254 | { | ||
255 | if (del_timer_sync(timer)) | ||
256 | closure_sub(cl, CLOSURE_TIMER + 1); | ||
257 | } | ||
258 | EXPORT_SYMBOL_GPL(__closure_flush_sync); | ||
259 | 190 | ||
260 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | 191 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG |
261 | 192 | ||
@@ -273,7 +204,7 @@ void closure_debug_create(struct closure *cl) | |||
273 | list_add(&cl->all, &closure_list); | 204 | list_add(&cl->all, &closure_list); |
274 | spin_unlock_irqrestore(&closure_list_lock, flags); | 205 | spin_unlock_irqrestore(&closure_list_lock, flags); |
275 | } | 206 | } |
276 | EXPORT_SYMBOL_GPL(closure_debug_create); | 207 | EXPORT_SYMBOL(closure_debug_create); |
277 | 208 | ||
278 | void closure_debug_destroy(struct closure *cl) | 209 | void closure_debug_destroy(struct closure *cl) |
279 | { | 210 | { |
@@ -286,7 +217,7 @@ void closure_debug_destroy(struct closure *cl) | |||
286 | list_del(&cl->all); | 217 | list_del(&cl->all); |
287 | spin_unlock_irqrestore(&closure_list_lock, flags); | 218 | spin_unlock_irqrestore(&closure_list_lock, flags); |
288 | } | 219 | } |
289 | EXPORT_SYMBOL_GPL(closure_debug_destroy); | 220 | EXPORT_SYMBOL(closure_debug_destroy); |
290 | 221 | ||
291 | static struct dentry *debug; | 222 | static struct dentry *debug; |
292 | 223 | ||
@@ -304,14 +235,12 @@ static int debug_seq_show(struct seq_file *f, void *data) | |||
304 | cl, (void *) cl->ip, cl->fn, cl->parent, | 235 | cl, (void *) cl->ip, cl->fn, cl->parent, |
305 | r & CLOSURE_REMAINING_MASK); | 236 | r & CLOSURE_REMAINING_MASK); |
306 | 237 | ||
307 | seq_printf(f, "%s%s%s%s%s%s\n", | 238 | seq_printf(f, "%s%s%s%s\n", |
308 | test_bit(WORK_STRUCT_PENDING, | 239 | test_bit(WORK_STRUCT_PENDING, |
309 | work_data_bits(&cl->work)) ? "Q" : "", | 240 | work_data_bits(&cl->work)) ? "Q" : "", |
310 | r & CLOSURE_RUNNING ? "R" : "", | 241 | r & CLOSURE_RUNNING ? "R" : "", |
311 | r & CLOSURE_BLOCKING ? "B" : "", | ||
312 | r & CLOSURE_STACK ? "S" : "", | 242 | r & CLOSURE_STACK ? "S" : "", |
313 | r & CLOSURE_SLEEPING ? "Sl" : "", | 243 | r & CLOSURE_SLEEPING ? "Sl" : ""); |
314 | r & CLOSURE_TIMER ? "T" : ""); | ||
315 | 244 | ||
316 | if (r & CLOSURE_WAITING) | 245 | if (r & CLOSURE_WAITING) |
317 | seq_printf(f, " W %pF\n", | 246 | seq_printf(f, " W %pF\n", |
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 00039924ea9d..9762f1be3304 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h | |||
@@ -155,21 +155,6 @@ | |||
155 | * delayed_work embeds a work item and a timer_list. The important thing is, use | 155 | * delayed_work embeds a work item and a timer_list. The important thing is, use |
156 | * it exactly like you would a regular closure and closure_put() will magically | 156 | * it exactly like you would a regular closure and closure_put() will magically |
157 | * handle everything for you. | 157 | * handle everything for you. |
158 | * | ||
159 | * We've got closures that embed timers, too. They're called, appropriately | ||
160 | * enough: | ||
161 | * struct closure_with_timer; | ||
162 | * | ||
163 | * This gives you access to closure_delay(). It takes a refcount for a specified | ||
164 | * number of jiffies - you could then call closure_sync() (for a slightly | ||
165 | * convoluted version of msleep()) or continue_at() - which gives you the same | ||
166 | * effect as using a delayed work item, except you can reuse the work_struct | ||
167 | * already embedded in struct closure. | ||
168 | * | ||
169 | * Lastly, there's struct closure_with_waitlist_and_timer. It does what you | ||
170 | * probably expect, if you happen to need the features of both. (You don't | ||
171 | * really want to know how all this is implemented, but if I've done my job | ||
172 | * right you shouldn't have to care). | ||
173 | */ | 158 | */ |
174 | 159 | ||
175 | struct closure; | 160 | struct closure; |
@@ -182,16 +167,11 @@ struct closure_waitlist { | |||
182 | enum closure_type { | 167 | enum closure_type { |
183 | TYPE_closure = 0, | 168 | TYPE_closure = 0, |
184 | TYPE_closure_with_waitlist = 1, | 169 | TYPE_closure_with_waitlist = 1, |
185 | TYPE_closure_with_timer = 2, | 170 | MAX_CLOSURE_TYPE = 1, |
186 | TYPE_closure_with_waitlist_and_timer = 3, | ||
187 | MAX_CLOSURE_TYPE = 3, | ||
188 | }; | 171 | }; |
189 | 172 | ||
190 | enum closure_state { | 173 | enum closure_state { |
191 | /* | 174 | /* |
192 | * CLOSURE_BLOCKING: Causes closure_wait_event() to block, instead of | ||
193 | * waiting asynchronously | ||
194 | * | ||
195 | * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by | 175 | * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by |
196 | * the thread that owns the closure, and cleared by the thread that's | 176 | * the thread that owns the closure, and cleared by the thread that's |
197 | * waking up the closure. | 177 | * waking up the closure. |
@@ -200,10 +180,6 @@ enum closure_state { | |||
200 | * - indicates that cl->task is valid and closure_put() may wake it up. | 180 | * - indicates that cl->task is valid and closure_put() may wake it up. |
201 | * Only set or cleared by the thread that owns the closure. | 181 | * Only set or cleared by the thread that owns the closure. |
202 | * | 182 | * |
203 | * CLOSURE_TIMER: Analogous to CLOSURE_WAITING, indicates that a closure | ||
204 | * has an outstanding timer. Must be set by the thread that owns the | ||
205 | * closure, and cleared by the timer function when the timer goes off. | ||
206 | * | ||
207 | * The rest are for debugging and don't affect behaviour: | 183 | * The rest are for debugging and don't affect behaviour: |
208 | * | 184 | * |
209 | * CLOSURE_RUNNING: Set when a closure is running (i.e. by | 185 | * CLOSURE_RUNNING: Set when a closure is running (i.e. by |
@@ -218,19 +194,17 @@ enum closure_state { | |||
218 | * closure with this flag set | 194 | * closure with this flag set |
219 | */ | 195 | */ |
220 | 196 | ||
221 | CLOSURE_BITS_START = (1 << 19), | 197 | CLOSURE_BITS_START = (1 << 23), |
222 | CLOSURE_DESTRUCTOR = (1 << 19), | 198 | CLOSURE_DESTRUCTOR = (1 << 23), |
223 | CLOSURE_BLOCKING = (1 << 21), | 199 | CLOSURE_WAITING = (1 << 25), |
224 | CLOSURE_WAITING = (1 << 23), | 200 | CLOSURE_SLEEPING = (1 << 27), |
225 | CLOSURE_SLEEPING = (1 << 25), | ||
226 | CLOSURE_TIMER = (1 << 27), | ||
227 | CLOSURE_RUNNING = (1 << 29), | 201 | CLOSURE_RUNNING = (1 << 29), |
228 | CLOSURE_STACK = (1 << 31), | 202 | CLOSURE_STACK = (1 << 31), |
229 | }; | 203 | }; |
230 | 204 | ||
231 | #define CLOSURE_GUARD_MASK \ | 205 | #define CLOSURE_GUARD_MASK \ |
232 | ((CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING|CLOSURE_WAITING| \ | 206 | ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_SLEEPING| \ |
233 | CLOSURE_SLEEPING|CLOSURE_TIMER|CLOSURE_RUNNING|CLOSURE_STACK) << 1) | 207 | CLOSURE_RUNNING|CLOSURE_STACK) << 1) |
234 | 208 | ||
235 | #define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) | 209 | #define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) |
236 | #define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) | 210 | #define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) |
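With CLOSURE_BLOCKING and CLOSURE_TIMER gone, the remaining word packs the reference count below CLOSURE_BITS_START and the state flags above it. The small illustration below reuses the constants from the new header; the arithmetic on a plain unsigned is only a stand-in for the kernel's atomic_t.

```c
/*
 * Illustration of how the closure refcount and state flags share one word:
 * bits below CLOSURE_BITS_START count outstanding references, bits above
 * are flags.  Constants are copied from the new header; the plain unsigned
 * here is only a stand-in for the kernel's atomic_t.
 */
#include <stdio.h>

enum {
	CLOSURE_BITS_START = (1 << 23),
	CLOSURE_DESTRUCTOR = (1 << 23),
	CLOSURE_WAITING    = (1 << 25),
	CLOSURE_SLEEPING   = (1 << 27),
	CLOSURE_RUNNING    = (1 << 29),
};

#define CLOSURE_REMAINING_MASK		(CLOSURE_BITS_START - 1)
#define CLOSURE_REMAINING_INITIALIZER	(1 | CLOSURE_RUNNING)

int main(void)
{
	unsigned remaining = CLOSURE_REMAINING_INITIALIZER;

	remaining += 2;			/* two closure_get()s */
	remaining |= CLOSURE_SLEEPING;

	printf("refs %u, running %d, sleeping %d\n",
	       remaining & CLOSURE_REMAINING_MASK,
	       !!(remaining & CLOSURE_RUNNING),
	       !!(remaining & CLOSURE_SLEEPING));
	return 0;	/* prints: refs 3, running 1, sleeping 1 */
}
```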
@@ -268,17 +242,6 @@ struct closure_with_waitlist { | |||
268 | struct closure_waitlist wait; | 242 | struct closure_waitlist wait; |
269 | }; | 243 | }; |
270 | 244 | ||
271 | struct closure_with_timer { | ||
272 | struct closure cl; | ||
273 | struct timer_list timer; | ||
274 | }; | ||
275 | |||
276 | struct closure_with_waitlist_and_timer { | ||
277 | struct closure cl; | ||
278 | struct closure_waitlist wait; | ||
279 | struct timer_list timer; | ||
280 | }; | ||
281 | |||
282 | extern unsigned invalid_closure_type(void); | 245 | extern unsigned invalid_closure_type(void); |
283 | 246 | ||
284 | #define __CLOSURE_TYPE(cl, _t) \ | 247 | #define __CLOSURE_TYPE(cl, _t) \ |
@@ -289,14 +252,11 @@ extern unsigned invalid_closure_type(void); | |||
289 | ( \ | 252 | ( \ |
290 | __CLOSURE_TYPE(cl, closure) \ | 253 | __CLOSURE_TYPE(cl, closure) \ |
291 | __CLOSURE_TYPE(cl, closure_with_waitlist) \ | 254 | __CLOSURE_TYPE(cl, closure_with_waitlist) \ |
292 | __CLOSURE_TYPE(cl, closure_with_timer) \ | ||
293 | __CLOSURE_TYPE(cl, closure_with_waitlist_and_timer) \ | ||
294 | invalid_closure_type() \ | 255 | invalid_closure_type() \ |
295 | ) | 256 | ) |
296 | 257 | ||
297 | void closure_sub(struct closure *cl, int v); | 258 | void closure_sub(struct closure *cl, int v); |
298 | void closure_put(struct closure *cl); | 259 | void closure_put(struct closure *cl); |
299 | void closure_queue(struct closure *cl); | ||
300 | void __closure_wake_up(struct closure_waitlist *list); | 260 | void __closure_wake_up(struct closure_waitlist *list); |
301 | bool closure_wait(struct closure_waitlist *list, struct closure *cl); | 261 | bool closure_wait(struct closure_waitlist *list, struct closure *cl); |
302 | void closure_sync(struct closure *cl); | 262 | void closure_sync(struct closure *cl); |
@@ -305,12 +265,6 @@ bool closure_trylock(struct closure *cl, struct closure *parent); | |||
305 | void __closure_lock(struct closure *cl, struct closure *parent, | 265 | void __closure_lock(struct closure *cl, struct closure *parent, |
306 | struct closure_waitlist *wait_list); | 266 | struct closure_waitlist *wait_list); |
307 | 267 | ||
308 | void do_closure_timer_init(struct closure *cl); | ||
309 | bool __closure_delay(struct closure *cl, unsigned long delay, | ||
310 | struct timer_list *timer); | ||
311 | void __closure_flush(struct closure *cl, struct timer_list *timer); | ||
312 | void __closure_flush_sync(struct closure *cl, struct timer_list *timer); | ||
313 | |||
314 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG | 268 | #ifdef CONFIG_BCACHE_CLOSURES_DEBUG |
315 | 269 | ||
316 | void closure_debug_init(void); | 270 | void closure_debug_init(void); |
@@ -354,11 +308,6 @@ static inline void closure_set_stopped(struct closure *cl) | |||
354 | atomic_sub(CLOSURE_RUNNING, &cl->remaining); | 308 | atomic_sub(CLOSURE_RUNNING, &cl->remaining); |
355 | } | 309 | } |
356 | 310 | ||
357 | static inline bool closure_is_stopped(struct closure *cl) | ||
358 | { | ||
359 | return !(atomic_read(&cl->remaining) & CLOSURE_RUNNING); | ||
360 | } | ||
361 | |||
362 | static inline bool closure_is_unlocked(struct closure *cl) | 311 | static inline bool closure_is_unlocked(struct closure *cl) |
363 | { | 312 | { |
364 | return atomic_read(&cl->remaining) == -1; | 313 | return atomic_read(&cl->remaining) == -1; |
@@ -367,14 +316,6 @@ static inline bool closure_is_unlocked(struct closure *cl) | |||
367 | static inline void do_closure_init(struct closure *cl, struct closure *parent, | 316 | static inline void do_closure_init(struct closure *cl, struct closure *parent, |
368 | bool running) | 317 | bool running) |
369 | { | 318 | { |
370 | switch (cl->type) { | ||
371 | case TYPE_closure_with_timer: | ||
372 | case TYPE_closure_with_waitlist_and_timer: | ||
373 | do_closure_timer_init(cl); | ||
374 | default: | ||
375 | break; | ||
376 | } | ||
377 | |||
378 | cl->parent = parent; | 319 | cl->parent = parent; |
379 | if (parent) | 320 | if (parent) |
380 | closure_get(parent); | 321 | closure_get(parent); |
@@ -429,8 +370,7 @@ do { \ | |||
429 | static inline void closure_init_stack(struct closure *cl) | 370 | static inline void closure_init_stack(struct closure *cl) |
430 | { | 371 | { |
431 | memset(cl, 0, sizeof(struct closure)); | 372 | memset(cl, 0, sizeof(struct closure)); |
432 | atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER| | 373 | atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); |
433 | CLOSURE_BLOCKING|CLOSURE_STACK); | ||
434 | } | 374 | } |
435 | 375 | ||
436 | /** | 376 | /** |
@@ -461,24 +401,6 @@ do { \ | |||
461 | #define closure_lock(cl, parent) \ | 401 | #define closure_lock(cl, parent) \ |
462 | __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait) | 402 | __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait) |
463 | 403 | ||
464 | /** | ||
465 | * closure_delay() - delay some number of jiffies | ||
466 | * @cl: the closure that will sleep | ||
467 | * @delay: the delay in jiffies | ||
468 | * | ||
469 | * Takes a refcount on @cl which will be released after @delay jiffies; this may | ||
470 | * be used to have a function run after a delay with continue_at(), or | ||
471 | * closure_sync() may be used for a convoluted version of msleep(). | ||
472 | */ | ||
473 | #define closure_delay(cl, delay) \ | ||
474 | __closure_delay(__to_internal_closure(cl), delay, &(cl)->timer) | ||
475 | |||
476 | #define closure_flush(cl) \ | ||
477 | __closure_flush(__to_internal_closure(cl), &(cl)->timer) | ||
478 | |||
479 | #define closure_flush_sync(cl) \ | ||
480 | __closure_flush_sync(__to_internal_closure(cl), &(cl)->timer) | ||
481 | |||
482 | static inline void __closure_end_sleep(struct closure *cl) | 404 | static inline void __closure_end_sleep(struct closure *cl) |
483 | { | 405 | { |
484 | __set_current_state(TASK_RUNNING); | 406 | __set_current_state(TASK_RUNNING); |
@@ -498,40 +420,6 @@ static inline void __closure_start_sleep(struct closure *cl) | |||
498 | } | 420 | } |
499 | 421 | ||
500 | /** | 422 | /** |
501 | * closure_blocking() - returns true if the closure is in blocking mode. | ||
502 | * | ||
503 | * If a closure is in blocking mode, closure_wait_event() will sleep until the | ||
504 | * condition is true instead of waiting asynchronously. | ||
505 | */ | ||
506 | static inline bool closure_blocking(struct closure *cl) | ||
507 | { | ||
508 | return atomic_read(&cl->remaining) & CLOSURE_BLOCKING; | ||
509 | } | ||
510 | |||
511 | /** | ||
512 | * set_closure_blocking() - put a closure in blocking mode. | ||
513 | * | ||
514 | * If a closure is in blocking mode, closure_wait_event() will sleep until the | ||
515 | * condition is true instead of waiting asynchronously. | ||
516 | * | ||
517 | * Not thread safe - can only be called by the thread running the closure. | ||
518 | */ | ||
519 | static inline void set_closure_blocking(struct closure *cl) | ||
520 | { | ||
521 | if (!closure_blocking(cl)) | ||
522 | atomic_add(CLOSURE_BLOCKING, &cl->remaining); | ||
523 | } | ||
524 | |||
525 | /* | ||
526 | * Not thread safe - can only be called by the thread running the closure. | ||
527 | */ | ||
528 | static inline void clear_closure_blocking(struct closure *cl) | ||
529 | { | ||
530 | if (closure_blocking(cl)) | ||
531 | atomic_sub(CLOSURE_BLOCKING, &cl->remaining); | ||
532 | } | ||
533 | |||
534 | /** | ||
535 | * closure_wake_up() - wake up all closures on a wait list. | 423 | * closure_wake_up() - wake up all closures on a wait list. |
536 | */ | 424 | */ |
537 | static inline void closure_wake_up(struct closure_waitlist *list) | 425 | static inline void closure_wake_up(struct closure_waitlist *list) |
@@ -561,63 +449,36 @@ static inline void closure_wake_up(struct closure_waitlist *list) | |||
561 | * refcount on our closure. If this was a stack allocated closure, that would be | 449 | * refcount on our closure. If this was a stack allocated closure, that would be |
562 | * bad. | 450 | * bad. |
563 | */ | 451 | */ |
564 | #define __closure_wait_event(list, cl, condition, _block) \ | 452 | #define closure_wait_event(list, cl, condition) \ |
565 | ({ \ | 453 | ({ \ |
566 | bool block = _block; \ | ||
567 | typeof(condition) ret; \ | 454 | typeof(condition) ret; \ |
568 | \ | 455 | \ |
569 | while (1) { \ | 456 | while (1) { \ |
570 | ret = (condition); \ | 457 | ret = (condition); \ |
571 | if (ret) { \ | 458 | if (ret) { \ |
572 | __closure_wake_up(list); \ | 459 | __closure_wake_up(list); \ |
573 | if (block) \ | 460 | closure_sync(cl); \ |
574 | closure_sync(cl); \ | ||
575 | \ | ||
576 | break; \ | 461 | break; \ |
577 | } \ | 462 | } \ |
578 | \ | 463 | \ |
579 | if (block) \ | 464 | __closure_start_sleep(cl); \ |
580 | __closure_start_sleep(cl); \ | ||
581 | \ | ||
582 | if (!closure_wait(list, cl)) { \ | ||
583 | if (!block) \ | ||
584 | break; \ | ||
585 | \ | 465 | \ |
466 | if (!closure_wait(list, cl)) \ | ||
586 | schedule(); \ | 467 | schedule(); \ |
587 | } \ | ||
588 | } \ | 468 | } \ |
589 | \ | 469 | \ |
590 | ret; \ | 470 | ret; \ |
591 | }) | 471 | }) |
592 | 472 | ||
593 | /** | 473 | static inline void closure_queue(struct closure *cl) |
594 | * closure_wait_event() - wait on a condition, synchronously or asynchronously. | 474 | { |
595 | * @list: the wait list to wait on | 475 | struct workqueue_struct *wq = cl->wq; |
596 | * @cl: the closure that is doing the waiting | 476 | if (wq) { |
597 | * @condition: a C expression for the event to wait for | 477 | INIT_WORK(&cl->work, cl->work.func); |
598 | * | 478 | BUG_ON(!queue_work(wq, &cl->work)); |
599 | * If the closure is in blocking mode, sleeps until the @condition evaluates to | 479 | } else |
600 | * true - exactly like wait_event(). | 480 | cl->fn(cl); |
601 | * | 481 | } |
602 | * If the closure is not in blocking mode, waits asynchronously; if the | ||
603 | * condition is currently false the @cl is put onto @list and returns. @list | ||
604 | * owns a refcount on @cl; closure_sync() or continue_at() may be used later to | ||
605 | * wait for another thread to wake up @list, which drops the refcount on @cl. | ||
606 | * | ||
607 | * Returns the value of @condition; @cl will be on @list iff @condition was | ||
608 | * false. | ||
609 | * | ||
610 | * closure_wake_up(@list) must be called after changing any variable that could | ||
611 | * cause @condition to become true. | ||
612 | */ | ||
613 | #define closure_wait_event(list, cl, condition) \ | ||
614 | __closure_wait_event(list, cl, condition, closure_blocking(cl)) | ||
615 | |||
616 | #define closure_wait_event_async(list, cl, condition) \ | ||
617 | __closure_wait_event(list, cl, condition, false) | ||
618 | |||
619 | #define closure_wait_event_sync(list, cl, condition) \ | ||
620 | __closure_wait_event(list, cl, condition, true) | ||
621 | 482 | ||
622 | static inline void set_closure_fn(struct closure *cl, closure_fn *fn, | 483 | static inline void set_closure_fn(struct closure *cl, closure_fn *fn, |
623 | struct workqueue_struct *wq) | 484 | struct workqueue_struct *wq) |
@@ -642,7 +503,7 @@ do { \ | |||
642 | #define continue_at_nobarrier(_cl, _fn, _wq) \ | 503 | #define continue_at_nobarrier(_cl, _fn, _wq) \ |
643 | do { \ | 504 | do { \ |
644 | set_closure_fn(_cl, _fn, _wq); \ | 505 | set_closure_fn(_cl, _fn, _wq); \ |
645 | closure_queue(cl); \ | 506 | closure_queue(_cl); \ |
646 | return; \ | 507 | return; \ |
647 | } while (0) | 508 | } while (0) |
648 | 509 | ||
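closure_queue() moves into the header as a static inline: run the closure on its workqueue if one is attached, otherwise call it directly. Here is a toy single-threaded sketch of that dispatch decision; struct workqueue and queue_on() are stand-ins for the demo, not the kernel workqueue API.

```c
/*
 * Toy, single-threaded sketch of the dispatch decision in the new inline
 * closure_queue(): run on the attached "workqueue" if there is one, else
 * call the function directly.  struct workqueue here is a stand-in, not
 * the kernel workqueue API.
 */
#include <stdio.h>

struct closure;
typedef void (closure_fn)(struct closure *);

struct workqueue {
	const char *name;
};

struct closure {
	closure_fn *fn;
	struct workqueue *wq;
};

static void queue_on(struct workqueue *wq, struct closure *cl)
{
	/* Stand-in for queue_work(): runs synchronously for the demo. */
	printf("queued on %s\n", wq->name);
	cl->fn(cl);
}

static void closure_queue(struct closure *cl)
{
	if (cl->wq)
		queue_on(cl->wq, cl);
	else
		cl->fn(cl);	/* no workqueue attached: call directly */
}

static void step(struct closure *cl)
{
	(void)cl;
	printf("closure ran\n");
}

int main(void)
{
	struct workqueue wq = { "bcache_demo_wq" };
	struct closure cl = { .fn = step, .wq = &wq };

	closure_queue(&cl);	/* goes through the "workqueue" */
	cl.wq = NULL;
	closure_queue(&cl);	/* called directly */
	return 0;
}
```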
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 88e6411eab4f..264fcfbd6290 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c | |||
@@ -8,7 +8,6 @@ | |||
8 | #include "bcache.h" | 8 | #include "bcache.h" |
9 | #include "btree.h" | 9 | #include "btree.h" |
10 | #include "debug.h" | 10 | #include "debug.h" |
11 | #include "request.h" | ||
12 | 11 | ||
13 | #include <linux/console.h> | 12 | #include <linux/console.h> |
14 | #include <linux/debugfs.h> | 13 | #include <linux/debugfs.h> |
@@ -77,29 +76,17 @@ int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) | |||
77 | return out - buf; | 76 | return out - buf; |
78 | } | 77 | } |
79 | 78 | ||
80 | int bch_btree_to_text(char *buf, size_t size, const struct btree *b) | 79 | #ifdef CONFIG_BCACHE_DEBUG |
81 | { | ||
82 | return scnprintf(buf, size, "%zu level %i/%i", | ||
83 | PTR_BUCKET_NR(b->c, &b->key, 0), | ||
84 | b->level, b->c->root ? b->c->root->level : -1); | ||
85 | } | ||
86 | |||
87 | #if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) | ||
88 | |||
89 | static bool skipped_backwards(struct btree *b, struct bkey *k) | ||
90 | { | ||
91 | return bkey_cmp(k, (!b->level) | ||
92 | ? &START_KEY(bkey_next(k)) | ||
93 | : bkey_next(k)) > 0; | ||
94 | } | ||
95 | 80 | ||
96 | static void dump_bset(struct btree *b, struct bset *i) | 81 | static void dump_bset(struct btree *b, struct bset *i) |
97 | { | 82 | { |
98 | struct bkey *k; | 83 | struct bkey *k, *next; |
99 | unsigned j; | 84 | unsigned j; |
100 | char buf[80]; | 85 | char buf[80]; |
101 | 86 | ||
102 | for (k = i->start; k < end(i); k = bkey_next(k)) { | 87 | for (k = i->start; k < end(i); k = next) { |
88 | next = bkey_next(k); | ||
89 | |||
103 | bch_bkey_to_text(buf, sizeof(buf), k); | 90 | bch_bkey_to_text(buf, sizeof(buf), k); |
104 | printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), | 91 | printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), |
105 | (uint64_t *) k - i->d, i->keys, buf); | 92 | (uint64_t *) k - i->d, i->keys, buf); |
@@ -115,15 +102,21 @@ static void dump_bset(struct btree *b, struct bset *i) | |||
115 | 102 | ||
116 | printk(" %s\n", bch_ptr_status(b->c, k)); | 103 | printk(" %s\n", bch_ptr_status(b->c, k)); |
117 | 104 | ||
118 | if (bkey_next(k) < end(i) && | 105 | if (next < end(i) && |
119 | skipped_backwards(b, k)) | 106 | bkey_cmp(k, !b->level ? &START_KEY(next) : next) > 0) |
120 | printk(KERN_ERR "Key skipped backwards\n"); | 107 | printk(KERN_ERR "Key skipped backwards\n"); |
121 | } | 108 | } |
122 | } | 109 | } |
123 | 110 | ||
124 | #endif | 111 | static void bch_dump_bucket(struct btree *b) |
112 | { | ||
113 | unsigned i; | ||
125 | 114 | ||
126 | #ifdef CONFIG_BCACHE_DEBUG | 115 | console_lock(); |
116 | for (i = 0; i <= b->nsets; i++) | ||
117 | dump_bset(b, b->sets[i].data); | ||
118 | console_unlock(); | ||
119 | } | ||
127 | 120 | ||
128 | void bch_btree_verify(struct btree *b, struct bset *new) | 121 | void bch_btree_verify(struct btree *b, struct bset *new) |
129 | { | 122 | { |
@@ -176,66 +169,44 @@ void bch_btree_verify(struct btree *b, struct bset *new) | |||
176 | mutex_unlock(&b->c->verify_lock); | 169 | mutex_unlock(&b->c->verify_lock); |
177 | } | 170 | } |
178 | 171 | ||
179 | static void data_verify_endio(struct bio *bio, int error) | 172 | void bch_data_verify(struct cached_dev *dc, struct bio *bio) |
180 | { | ||
181 | struct closure *cl = bio->bi_private; | ||
182 | closure_put(cl); | ||
183 | } | ||
184 | |||
185 | void bch_data_verify(struct search *s) | ||
186 | { | 173 | { |
187 | char name[BDEVNAME_SIZE]; | 174 | char name[BDEVNAME_SIZE]; |
188 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||
189 | struct closure *cl = &s->cl; | ||
190 | struct bio *check; | 175 | struct bio *check; |
191 | struct bio_vec *bv; | 176 | struct bio_vec *bv; |
192 | int i; | 177 | int i; |
193 | 178 | ||
194 | if (!s->unaligned_bvec) | 179 | check = bio_clone(bio, GFP_NOIO); |
195 | bio_for_each_segment(bv, s->orig_bio, i) | ||
196 | bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; | ||
197 | |||
198 | check = bio_clone(s->orig_bio, GFP_NOIO); | ||
199 | if (!check) | 180 | if (!check) |
200 | return; | 181 | return; |
201 | 182 | ||
202 | if (bio_alloc_pages(check, GFP_NOIO)) | 183 | if (bio_alloc_pages(check, GFP_NOIO)) |
203 | goto out_put; | 184 | goto out_put; |
204 | 185 | ||
205 | check->bi_rw = READ_SYNC; | 186 | submit_bio_wait(READ_SYNC, check); |
206 | check->bi_private = cl; | ||
207 | check->bi_end_io = data_verify_endio; | ||
208 | |||
209 | closure_bio_submit(check, cl, &dc->disk); | ||
210 | closure_sync(cl); | ||
211 | 187 | ||
212 | bio_for_each_segment(bv, s->orig_bio, i) { | 188 | bio_for_each_segment(bv, bio, i) { |
213 | void *p1 = kmap(bv->bv_page); | 189 | void *p1 = kmap_atomic(bv->bv_page); |
214 | void *p2 = kmap(check->bi_io_vec[i].bv_page); | 190 | void *p2 = page_address(check->bi_io_vec[i].bv_page); |
215 | 191 | ||
216 | if (memcmp(p1 + bv->bv_offset, | 192 | cache_set_err_on(memcmp(p1 + bv->bv_offset, |
217 | p2 + bv->bv_offset, | 193 | p2 + bv->bv_offset, |
218 | bv->bv_len)) | 194 | bv->bv_len), |
219 | printk(KERN_ERR | 195 | dc->disk.c, |
220 | "bcache (%s): verify failed at sector %llu\n", | 196 | "verify failed at dev %s sector %llu", |
221 | bdevname(dc->bdev, name), | 197 | bdevname(dc->bdev, name), |
222 | (uint64_t) s->orig_bio->bi_sector); | 198 | (uint64_t) bio->bi_sector); |
223 | 199 | ||
224 | kunmap(bv->bv_page); | 200 | kunmap_atomic(p1); |
225 | kunmap(check->bi_io_vec[i].bv_page); | ||
226 | } | 201 | } |
227 | 202 | ||
228 | __bio_for_each_segment(bv, check, i, 0) | 203 | bio_for_each_segment_all(bv, check, i) |
229 | __free_page(bv->bv_page); | 204 | __free_page(bv->bv_page); |
230 | out_put: | 205 | out_put: |
231 | bio_put(check); | 206 | bio_put(check); |
232 | } | 207 | } |
233 | 208 | ||
234 | #endif | 209 | int __bch_count_data(struct btree *b) |
235 | |||
236 | #ifdef CONFIG_BCACHE_EDEBUG | ||
237 | |||
238 | unsigned bch_count_data(struct btree *b) | ||
239 | { | 210 | { |
240 | unsigned ret = 0; | 211 | unsigned ret = 0; |
241 | struct btree_iter iter; | 212 | struct btree_iter iter; |
@@ -247,72 +218,60 @@ unsigned bch_count_data(struct btree *b) | |||
247 | return ret; | 218 | return ret; |
248 | } | 219 | } |
249 | 220 | ||
250 | static void vdump_bucket_and_panic(struct btree *b, const char *fmt, | 221 | void __bch_check_keys(struct btree *b, const char *fmt, ...) |
251 | va_list args) | ||
252 | { | ||
253 | unsigned i; | ||
254 | char buf[80]; | ||
255 | |||
256 | console_lock(); | ||
257 | |||
258 | for (i = 0; i <= b->nsets; i++) | ||
259 | dump_bset(b, b->sets[i].data); | ||
260 | |||
261 | vprintk(fmt, args); | ||
262 | |||
263 | console_unlock(); | ||
264 | |||
265 | bch_btree_to_text(buf, sizeof(buf), b); | ||
266 | panic("at %s\n", buf); | ||
267 | } | ||
268 | |||
269 | void bch_check_key_order_msg(struct btree *b, struct bset *i, | ||
270 | const char *fmt, ...) | ||
271 | { | ||
272 | struct bkey *k; | ||
273 | |||
274 | if (!i->keys) | ||
275 | return; | ||
276 | |||
277 | for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k)) | ||
278 | if (skipped_backwards(b, k)) { | ||
279 | va_list args; | ||
280 | va_start(args, fmt); | ||
281 | |||
282 | vdump_bucket_and_panic(b, fmt, args); | ||
283 | va_end(args); | ||
284 | } | ||
285 | } | ||
286 | |||
287 | void bch_check_keys(struct btree *b, const char *fmt, ...) | ||
288 | { | 222 | { |
289 | va_list args; | 223 | va_list args; |
290 | struct bkey *k, *p = NULL; | 224 | struct bkey *k, *p = NULL; |
291 | struct btree_iter iter; | 225 | struct btree_iter iter; |
292 | 226 | const char *err; | |
293 | if (b->level) | ||
294 | return; | ||
295 | 227 | ||
296 | for_each_key(b, k, &iter) { | 228 | for_each_key(b, k, &iter) { |
297 | if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) { | 229 | if (!b->level) { |
298 | printk(KERN_ERR "Keys out of order:\n"); | 230 | err = "Keys out of order"; |
299 | goto bug; | 231 | if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) |
300 | } | 232 | goto bug; |
301 | 233 | ||
302 | if (bch_ptr_invalid(b, k)) | 234 | if (bch_ptr_invalid(b, k)) |
303 | continue; | 235 | continue; |
304 | 236 | ||
305 | if (p && bkey_cmp(p, &START_KEY(k)) > 0) { | 237 | err = "Overlapping keys"; |
306 | printk(KERN_ERR "Overlapping keys:\n"); | 238 | if (p && bkey_cmp(p, &START_KEY(k)) > 0) |
307 | goto bug; | 239 | goto bug; |
240 | } else { | ||
241 | if (bch_ptr_bad(b, k)) | ||
242 | continue; | ||
243 | |||
244 | err = "Duplicate keys"; | ||
245 | if (p && !bkey_cmp(p, k)) | ||
246 | goto bug; | ||
308 | } | 247 | } |
309 | p = k; | 248 | p = k; |
310 | } | 249 | } |
250 | |||
251 | err = "Key larger than btree node key"; | ||
252 | if (p && bkey_cmp(p, &b->key) > 0) | ||
253 | goto bug; | ||
254 | |||
311 | return; | 255 | return; |
312 | bug: | 256 | bug: |
257 | bch_dump_bucket(b); | ||
258 | |||
313 | va_start(args, fmt); | 259 | va_start(args, fmt); |
314 | vdump_bucket_and_panic(b, fmt, args); | 260 | vprintk(fmt, args); |
315 | va_end(args); | 261 | va_end(args); |
262 | |||
263 | panic("bcache error: %s:\n", err); | ||
264 | } | ||
265 | |||
266 | void bch_btree_iter_next_check(struct btree_iter *iter) | ||
267 | { | ||
268 | struct bkey *k = iter->data->k, *next = bkey_next(k); | ||
269 | |||
270 | if (next < iter->data->end && | ||
271 | bkey_cmp(k, iter->b->level ? next : &START_KEY(next)) > 0) { | ||
272 | bch_dump_bucket(iter->b); | ||
273 | panic("Key skipped backwards\n"); | ||
274 | } | ||
316 | } | 275 | } |
317 | 276 | ||
318 | #endif | 277 | #endif |
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index 1c39b5a2489b..2ede60e31874 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h | |||
@@ -4,40 +4,44 @@ | |||
4 | /* Btree/bkey debug printing */ | 4 | /* Btree/bkey debug printing */ |
5 | 5 | ||
6 | int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); | 6 | int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); |
7 | int bch_btree_to_text(char *buf, size_t size, const struct btree *b); | ||
8 | |||
9 | #ifdef CONFIG_BCACHE_EDEBUG | ||
10 | |||
11 | unsigned bch_count_data(struct btree *); | ||
12 | void bch_check_key_order_msg(struct btree *, struct bset *, const char *, ...); | ||
13 | void bch_check_keys(struct btree *, const char *, ...); | ||
14 | |||
15 | #define bch_check_key_order(b, i) \ | ||
16 | bch_check_key_order_msg(b, i, "keys out of order") | ||
17 | #define EBUG_ON(cond) BUG_ON(cond) | ||
18 | |||
19 | #else /* EDEBUG */ | ||
20 | |||
21 | #define bch_count_data(b) 0 | ||
22 | #define bch_check_key_order(b, i) do {} while (0) | ||
23 | #define bch_check_key_order_msg(b, i, ...) do {} while (0) | ||
24 | #define bch_check_keys(b, ...) do {} while (0) | ||
25 | #define EBUG_ON(cond) do {} while (0) | ||
26 | |||
27 | #endif | ||
28 | 7 | ||
29 | #ifdef CONFIG_BCACHE_DEBUG | 8 | #ifdef CONFIG_BCACHE_DEBUG |
30 | 9 | ||
31 | void bch_btree_verify(struct btree *, struct bset *); | 10 | void bch_btree_verify(struct btree *, struct bset *); |
32 | void bch_data_verify(struct search *); | 11 | void bch_data_verify(struct cached_dev *, struct bio *); |
12 | int __bch_count_data(struct btree *); | ||
13 | void __bch_check_keys(struct btree *, const char *, ...); | ||
14 | void bch_btree_iter_next_check(struct btree_iter *); | ||
15 | |||
16 | #define EBUG_ON(cond) BUG_ON(cond) | ||
17 | #define expensive_debug_checks(c) ((c)->expensive_debug_checks) | ||
18 | #define key_merging_disabled(c) ((c)->key_merging_disabled) | ||
19 | #define bypass_torture_test(d) ((d)->bypass_torture_test) | ||
33 | 20 | ||
34 | #else /* DEBUG */ | 21 | #else /* DEBUG */ |
35 | 22 | ||
36 | static inline void bch_btree_verify(struct btree *b, struct bset *i) {} | 23 | static inline void bch_btree_verify(struct btree *b, struct bset *i) {} |
37 | static inline void bch_data_verify(struct search *s) {}; | 24 | static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} |
25 | static inline int __bch_count_data(struct btree *b) { return -1; } | ||
26 | static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {} | ||
27 | static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} | ||
28 | |||
29 | #define EBUG_ON(cond) do { if (cond); } while (0) | ||
30 | #define expensive_debug_checks(c) 0 | ||
31 | #define key_merging_disabled(c) 0 | ||
32 | #define bypass_torture_test(d) 0 | ||
38 | 33 | ||
39 | #endif | 34 | #endif |
40 | 35 | ||
36 | #define bch_count_data(b) \ | ||
37 | (expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1) | ||
38 | |||
39 | #define bch_check_keys(b, ...) \ | ||
40 | do { \ | ||
41 | if (expensive_debug_checks((b)->c)) \ | ||
42 | __bch_check_keys(b, __VA_ARGS__); \ | ||
43 | } while (0) | ||
44 | |||
41 | #ifdef CONFIG_DEBUG_FS | 45 | #ifdef CONFIG_DEBUG_FS |
42 | void bch_debug_init_cache_set(struct cache_set *); | 46 | void bch_debug_init_cache_set(struct cache_set *); |
43 | #else | 47 | #else |
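The reworked debug.h keeps the bch_check_keys()/bch_count_data() wrappers always visible but gates the expensive body behind a runtime flag, and the checker compiles away entirely when the debug config is off. The following compilable sketch shows that gating pattern with invented names (MY_DEBUG, struct cache, __check_keys); it is not the bcache implementation.

```c
/*
 * Compilable sketch of the gating pattern in the reworked debug.h: the
 * wrapper macro always exists, the expensive checker runs only when a
 * runtime flag is set, and it compiles away entirely without the debug
 * config.  MY_DEBUG, struct cache and __check_keys are invented names.
 */
#include <stdio.h>

#define MY_DEBUG 1			/* stands in for CONFIG_BCACHE_DEBUG */

struct cache {
	int expensive_debug_checks;	/* runtime knob, cf. sysfs attribute */
};

#if MY_DEBUG
static void __check_keys(struct cache *c)
{
	(void)c;
	printf("running expensive key checks\n");
}
#define expensive_debug_checks(c)	((c)->expensive_debug_checks)
#else
static inline void __check_keys(struct cache *c) { (void)c; }
#define expensive_debug_checks(c)	0
#endif

#define check_keys(c)							\
	do {								\
		if (expensive_debug_checks(c))				\
			__check_keys(c);				\
	} while (0)

int main(void)
{
	struct cache c = { .expensive_debug_checks = 1 };

	check_keys(&c);			/* runs the checker */
	c.expensive_debug_checks = 0;
	check_keys(&c);			/* still compiled in, skipped at runtime */
	return 0;
}
```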
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 8435f81e5d85..ecdaa671bd50 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c | |||
@@ -7,7 +7,6 @@ | |||
7 | #include "bcache.h" | 7 | #include "bcache.h" |
8 | #include "btree.h" | 8 | #include "btree.h" |
9 | #include "debug.h" | 9 | #include "debug.h" |
10 | #include "request.h" | ||
11 | 10 | ||
12 | #include <trace/events/bcache.h> | 11 | #include <trace/events/bcache.h> |
13 | 12 | ||
@@ -31,17 +30,20 @@ static void journal_read_endio(struct bio *bio, int error) | |||
31 | } | 30 | } |
32 | 31 | ||
33 | static int journal_read_bucket(struct cache *ca, struct list_head *list, | 32 | static int journal_read_bucket(struct cache *ca, struct list_head *list, |
34 | struct btree_op *op, unsigned bucket_index) | 33 | unsigned bucket_index) |
35 | { | 34 | { |
36 | struct journal_device *ja = &ca->journal; | 35 | struct journal_device *ja = &ca->journal; |
37 | struct bio *bio = &ja->bio; | 36 | struct bio *bio = &ja->bio; |
38 | 37 | ||
39 | struct journal_replay *i; | 38 | struct journal_replay *i; |
40 | struct jset *j, *data = ca->set->journal.w[0].data; | 39 | struct jset *j, *data = ca->set->journal.w[0].data; |
40 | struct closure cl; | ||
41 | unsigned len, left, offset = 0; | 41 | unsigned len, left, offset = 0; |
42 | int ret = 0; | 42 | int ret = 0; |
43 | sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); | 43 | sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); |
44 | 44 | ||
45 | closure_init_stack(&cl); | ||
46 | |||
45 | pr_debug("reading %llu", (uint64_t) bucket); | 47 | pr_debug("reading %llu", (uint64_t) bucket); |
46 | 48 | ||
47 | while (offset < ca->sb.bucket_size) { | 49 | while (offset < ca->sb.bucket_size) { |
@@ -55,11 +57,11 @@ reread: left = ca->sb.bucket_size - offset; | |||
55 | bio->bi_size = len << 9; | 57 | bio->bi_size = len << 9; |
56 | 58 | ||
57 | bio->bi_end_io = journal_read_endio; | 59 | bio->bi_end_io = journal_read_endio; |
58 | bio->bi_private = &op->cl; | 60 | bio->bi_private = &cl; |
59 | bch_bio_map(bio, data); | 61 | bch_bio_map(bio, data); |
60 | 62 | ||
61 | closure_bio_submit(bio, &op->cl, ca); | 63 | closure_bio_submit(bio, &cl, ca); |
62 | closure_sync(&op->cl); | 64 | closure_sync(&cl); |
63 | 65 | ||
64 | /* This function could be simpler now since we no longer write | 66 | /* This function could be simpler now since we no longer write |
65 | * journal entries that overlap bucket boundaries; this means | 67 | * journal entries that overlap bucket boundaries; this means |
@@ -72,7 +74,7 @@ reread: left = ca->sb.bucket_size - offset; | |||
72 | struct list_head *where; | 74 | struct list_head *where; |
73 | size_t blocks, bytes = set_bytes(j); | 75 | size_t blocks, bytes = set_bytes(j); |
74 | 76 | ||
75 | if (j->magic != jset_magic(ca->set)) | 77 | if (j->magic != jset_magic(&ca->sb)) |
76 | return ret; | 78 | return ret; |
77 | 79 | ||
78 | if (bytes > left << 9) | 80 | if (bytes > left << 9) |
@@ -129,12 +131,11 @@ next_set: | |||
129 | return ret; | 131 | return ret; |
130 | } | 132 | } |
131 | 133 | ||
132 | int bch_journal_read(struct cache_set *c, struct list_head *list, | 134 | int bch_journal_read(struct cache_set *c, struct list_head *list) |
133 | struct btree_op *op) | ||
134 | { | 135 | { |
135 | #define read_bucket(b) \ | 136 | #define read_bucket(b) \ |
136 | ({ \ | 137 | ({ \ |
137 | int ret = journal_read_bucket(ca, list, op, b); \ | 138 | int ret = journal_read_bucket(ca, list, b); \ |
138 | __set_bit(b, bitmap); \ | 139 | __set_bit(b, bitmap); \ |
139 | if (ret < 0) \ | 140 | if (ret < 0) \ |
140 | return ret; \ | 141 | return ret; \ |
@@ -292,8 +293,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) | |||
292 | } | 293 | } |
293 | } | 294 | } |
294 | 295 | ||
295 | int bch_journal_replay(struct cache_set *s, struct list_head *list, | 296 | int bch_journal_replay(struct cache_set *s, struct list_head *list) |
296 | struct btree_op *op) | ||
297 | { | 297 | { |
298 | int ret = 0, keys = 0, entries = 0; | 298 | int ret = 0, keys = 0, entries = 0; |
299 | struct bkey *k; | 299 | struct bkey *k; |
@@ -301,31 +301,30 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, | |||
301 | list_entry(list->prev, struct journal_replay, list); | 301 | list_entry(list->prev, struct journal_replay, list); |
302 | 302 | ||
303 | uint64_t start = i->j.last_seq, end = i->j.seq, n = start; | 303 | uint64_t start = i->j.last_seq, end = i->j.seq, n = start; |
304 | struct keylist keylist; | ||
305 | |||
306 | bch_keylist_init(&keylist); | ||
304 | 307 | ||
305 | list_for_each_entry(i, list, list) { | 308 | list_for_each_entry(i, list, list) { |
306 | BUG_ON(i->pin && atomic_read(i->pin) != 1); | 309 | BUG_ON(i->pin && atomic_read(i->pin) != 1); |
307 | 310 | ||
308 | if (n != i->j.seq) | 311 | cache_set_err_on(n != i->j.seq, s, |
309 | pr_err( | 312 | "bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)", |
310 | "journal entries %llu-%llu missing! (replaying %llu-%llu)\n", | 313 | n, i->j.seq - 1, start, end); |
311 | n, i->j.seq - 1, start, end); | ||
312 | 314 | ||
313 | for (k = i->j.start; | 315 | for (k = i->j.start; |
314 | k < end(&i->j); | 316 | k < end(&i->j); |
315 | k = bkey_next(k)) { | 317 | k = bkey_next(k)) { |
316 | trace_bcache_journal_replay_key(k); | 318 | trace_bcache_journal_replay_key(k); |
317 | 319 | ||
318 | bkey_copy(op->keys.top, k); | 320 | bkey_copy(keylist.top, k); |
319 | bch_keylist_push(&op->keys); | 321 | bch_keylist_push(&keylist); |
320 | |||
321 | op->journal = i->pin; | ||
322 | atomic_inc(op->journal); | ||
323 | 322 | ||
324 | ret = bch_btree_insert(op, s); | 323 | ret = bch_btree_insert(s, &keylist, i->pin, NULL); |
325 | if (ret) | 324 | if (ret) |
326 | goto err; | 325 | goto err; |
327 | 326 | ||
328 | BUG_ON(!bch_keylist_empty(&op->keys)); | 327 | BUG_ON(!bch_keylist_empty(&keylist)); |
329 | keys++; | 328 | keys++; |
330 | 329 | ||
331 | cond_resched(); | 330 | cond_resched(); |
@@ -339,14 +338,13 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, | |||
339 | 338 | ||
340 | pr_info("journal replay done, %i keys in %i entries, seq %llu", | 339 | pr_info("journal replay done, %i keys in %i entries, seq %llu", |
341 | keys, entries, end); | 340 | keys, entries, end); |
342 | 341 | err: | |
343 | while (!list_empty(list)) { | 342 | while (!list_empty(list)) { |
344 | i = list_first_entry(list, struct journal_replay, list); | 343 | i = list_first_entry(list, struct journal_replay, list); |
345 | list_del(&i->list); | 344 | list_del(&i->list); |
346 | kfree(i); | 345 | kfree(i); |
347 | } | 346 | } |
348 | err: | 347 | |
349 | closure_sync(&op->cl); | ||
350 | return ret; | 348 | return ret; |
351 | } | 349 | } |
352 | 350 | ||
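Context for the replay hunk above: bch_journal_replay() now builds its keylist on the stack and hands each entry's pin straight to bch_btree_insert(), and the err label was moved so the replay list is always freed. A minimal userspace sketch of the pin idea (simplified types, an assumption-level model rather than the kernel code): the journal holds one reference per entry and each in-flight insert derived from that entry holds another, so the entry cannot be reclaimed until its keys have reached the index.

        /* Sketch only: models the journal-pin refcount, not bcache's actual types. */
        #include <stdatomic.h>
        #include <stdio.h>

        struct journal_entry {
                unsigned long long seq;
                atomic_int pin;               /* held by the journal + in-flight inserts */
        };

        static void replay_entry(struct journal_entry *e)
        {
                atomic_fetch_add(&e->pin, 1);  /* insert path takes a reference ...      */
                /* ... keys from this entry are inserted into the index here ...         */
                atomic_fetch_sub(&e->pin, 1);  /* ... and drops it once they are indexed */
        }

        int main(void)
        {
                struct journal_entry e = { .seq = 7, .pin = 1 };
                replay_entry(&e);
                printf("entry %llu pin=%d (journal may reclaim once this reaches 0)\n",
                       e.seq, atomic_load(&e.pin));
                return 0;
        }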
@@ -358,48 +356,35 @@ static void btree_flush_write(struct cache_set *c) | |||
358 | * Try to find the btree node that references the oldest journal | 356 | * Try to find the btree node that references the oldest journal |
359 | * entry; best is our current candidate and is locked if non-NULL: | 357 | * entry; best is our current candidate and is locked if non-NULL: |
360 | */ | 358 | */ |
361 | struct btree *b, *best = NULL; | 359 | struct btree *b, *best; |
362 | unsigned iter; | 360 | unsigned i; |
361 | retry: | ||
362 | best = NULL; | ||
363 | |||
364 | for_each_cached_btree(b, c, i) | ||
365 | if (btree_current_write(b)->journal) { | ||
366 | if (!best) | ||
367 | best = b; | ||
368 | else if (journal_pin_cmp(c, | ||
369 | btree_current_write(best)->journal, | ||
370 | btree_current_write(b)->journal)) { | ||
371 | best = b; | ||
372 | } | ||
373 | } | ||
363 | 374 | ||
364 | for_each_cached_btree(b, c, iter) { | 375 | b = best; |
365 | if (!down_write_trylock(&b->lock)) | 376 | if (b) { |
366 | continue; | 377 | rw_lock(true, b, b->level); |
367 | 378 | ||
368 | if (!btree_node_dirty(b) || | 379 | if (!btree_current_write(b)->journal) { |
369 | !btree_current_write(b)->journal) { | ||
370 | rw_unlock(true, b); | 380 | rw_unlock(true, b); |
371 | continue; | 381 | /* We raced */ |
382 | goto retry; | ||
372 | } | 383 | } |
373 | 384 | ||
374 | if (!best) | 385 | bch_btree_node_write(b, NULL); |
375 | best = b; | 386 | rw_unlock(true, b); |
376 | else if (journal_pin_cmp(c, | ||
377 | btree_current_write(best), | ||
378 | btree_current_write(b))) { | ||
379 | rw_unlock(true, best); | ||
380 | best = b; | ||
381 | } else | ||
382 | rw_unlock(true, b); | ||
383 | } | 387 | } |
384 | |||
385 | if (best) | ||
386 | goto out; | ||
387 | |||
388 | /* We can't find the best btree node, just pick the first */ | ||
389 | list_for_each_entry(b, &c->btree_cache, list) | ||
390 | if (!b->level && btree_node_dirty(b)) { | ||
391 | best = b; | ||
392 | rw_lock(true, best, best->level); | ||
393 | goto found; | ||
394 | } | ||
395 | |||
396 | out: | ||
397 | if (!best) | ||
398 | return; | ||
399 | found: | ||
400 | if (btree_node_dirty(best)) | ||
401 | bch_btree_node_write(best, NULL); | ||
402 | rw_unlock(true, best); | ||
403 | } | 388 | } |
404 | 389 | ||
405 | #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) | 390 | #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) |
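A quick worked example of the last_seq() macro kept just above (plain C; fifo_used() is only described, not reimplemented, here):

        #include <stdio.h>

        /* last_seq(j) = j->seq - fifo_used(&j->pin) + 1 */
        int main(void)
        {
                unsigned long long seq = 100;   /* newest journal entry written            */
                unsigned pinned = 5;            /* entries still pinned by dirty btree nodes */

                unsigned long long last_seq = seq - pinned + 1;
                printf("oldest entry that must be kept: %llu\n", last_seq);   /* 96 */
                return 0;
        }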
@@ -495,7 +480,7 @@ static void journal_reclaim(struct cache_set *c) | |||
495 | do_journal_discard(ca); | 480 | do_journal_discard(ca); |
496 | 481 | ||
497 | if (c->journal.blocks_free) | 482 | if (c->journal.blocks_free) |
498 | return; | 483 | goto out; |
499 | 484 | ||
500 | /* | 485 | /* |
501 | * Allocate: | 486 | * Allocate: |
@@ -521,7 +506,7 @@ static void journal_reclaim(struct cache_set *c) | |||
521 | 506 | ||
522 | if (n) | 507 | if (n) |
523 | c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; | 508 | c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; |
524 | 509 | out: | |
525 | if (!journal_full(&c->journal)) | 510 | if (!journal_full(&c->journal)) |
526 | __closure_wake_up(&c->journal.wait); | 511 | __closure_wake_up(&c->journal.wait); |
527 | } | 512 | } |
@@ -554,32 +539,26 @@ static void journal_write_endio(struct bio *bio, int error) | |||
554 | struct journal_write *w = bio->bi_private; | 539 | struct journal_write *w = bio->bi_private; |
555 | 540 | ||
556 | cache_set_err_on(error, w->c, "journal io error"); | 541 | cache_set_err_on(error, w->c, "journal io error"); |
557 | closure_put(&w->c->journal.io.cl); | 542 | closure_put(&w->c->journal.io); |
558 | } | 543 | } |
559 | 544 | ||
560 | static void journal_write(struct closure *); | 545 | static void journal_write(struct closure *); |
561 | 546 | ||
562 | static void journal_write_done(struct closure *cl) | 547 | static void journal_write_done(struct closure *cl) |
563 | { | 548 | { |
564 | struct journal *j = container_of(cl, struct journal, io.cl); | 549 | struct journal *j = container_of(cl, struct journal, io); |
565 | struct cache_set *c = container_of(j, struct cache_set, journal); | ||
566 | |||
567 | struct journal_write *w = (j->cur == j->w) | 550 | struct journal_write *w = (j->cur == j->w) |
568 | ? &j->w[1] | 551 | ? &j->w[1] |
569 | : &j->w[0]; | 552 | : &j->w[0]; |
570 | 553 | ||
571 | __closure_wake_up(&w->wait); | 554 | __closure_wake_up(&w->wait); |
572 | 555 | continue_at_nobarrier(cl, journal_write, system_wq); | |
573 | if (c->journal_delay_ms) | ||
574 | closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms)); | ||
575 | |||
576 | continue_at(cl, journal_write, system_wq); | ||
577 | } | 556 | } |
578 | 557 | ||
579 | static void journal_write_unlocked(struct closure *cl) | 558 | static void journal_write_unlocked(struct closure *cl) |
580 | __releases(c->journal.lock) | 559 | __releases(c->journal.lock) |
581 | { | 560 | { |
582 | struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); | 561 | struct cache_set *c = container_of(cl, struct cache_set, journal.io); |
583 | struct cache *ca; | 562 | struct cache *ca; |
584 | struct journal_write *w = c->journal.cur; | 563 | struct journal_write *w = c->journal.cur; |
585 | struct bkey *k = &c->journal.key; | 564 | struct bkey *k = &c->journal.key; |
@@ -617,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl) | |||
617 | for_each_cache(ca, c, i) | 596 | for_each_cache(ca, c, i) |
618 | w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; | 597 | w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; |
619 | 598 | ||
620 | w->data->magic = jset_magic(c); | 599 | w->data->magic = jset_magic(&c->sb); |
621 | w->data->version = BCACHE_JSET_VERSION; | 600 | w->data->version = BCACHE_JSET_VERSION; |
622 | w->data->last_seq = last_seq(&c->journal); | 601 | w->data->last_seq = last_seq(&c->journal); |
623 | w->data->csum = csum_set(w->data); | 602 | w->data->csum = csum_set(w->data); |
@@ -660,121 +639,134 @@ static void journal_write_unlocked(struct closure *cl) | |||
660 | 639 | ||
661 | static void journal_write(struct closure *cl) | 640 | static void journal_write(struct closure *cl) |
662 | { | 641 | { |
663 | struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); | 642 | struct cache_set *c = container_of(cl, struct cache_set, journal.io); |
664 | 643 | ||
665 | spin_lock(&c->journal.lock); | 644 | spin_lock(&c->journal.lock); |
666 | journal_write_unlocked(cl); | 645 | journal_write_unlocked(cl); |
667 | } | 646 | } |
668 | 647 | ||
669 | static void __journal_try_write(struct cache_set *c, bool noflush) | 648 | static void journal_try_write(struct cache_set *c) |
670 | __releases(c->journal.lock) | 649 | __releases(c->journal.lock) |
671 | { | 650 | { |
672 | struct closure *cl = &c->journal.io.cl; | 651 | struct closure *cl = &c->journal.io; |
652 | struct journal_write *w = c->journal.cur; | ||
673 | 653 | ||
674 | if (!closure_trylock(cl, &c->cl)) | 654 | w->need_write = true; |
675 | spin_unlock(&c->journal.lock); | 655 | |
676 | else if (noflush && journal_full(&c->journal)) { | 656 | if (closure_trylock(cl, &c->cl)) |
677 | spin_unlock(&c->journal.lock); | ||
678 | continue_at(cl, journal_write, system_wq); | ||
679 | } else | ||
680 | journal_write_unlocked(cl); | 657 | journal_write_unlocked(cl); |
658 | else | ||
659 | spin_unlock(&c->journal.lock); | ||
681 | } | 660 | } |
682 | 661 | ||
683 | #define journal_try_write(c) __journal_try_write(c, false) | 662 | static struct journal_write *journal_wait_for_write(struct cache_set *c, |
684 | 663 | unsigned nkeys) | |
685 | void bch_journal_meta(struct cache_set *c, struct closure *cl) | ||
686 | { | 664 | { |
687 | struct journal_write *w; | 665 | size_t sectors; |
666 | struct closure cl; | ||
688 | 667 | ||
689 | if (CACHE_SYNC(&c->sb)) { | 668 | closure_init_stack(&cl); |
690 | spin_lock(&c->journal.lock); | 669 | |
670 | spin_lock(&c->journal.lock); | ||
691 | 671 | ||
692 | w = c->journal.cur; | 672 | while (1) { |
693 | w->need_write = true; | 673 | struct journal_write *w = c->journal.cur; |
694 | 674 | ||
695 | if (cl) | 675 | sectors = __set_blocks(w->data, w->data->keys + nkeys, |
696 | BUG_ON(!closure_wait(&w->wait, cl)); | 676 | c) * c->sb.block_size; |
697 | 677 | ||
698 | closure_flush(&c->journal.io); | 678 | if (sectors <= min_t(size_t, |
699 | __journal_try_write(c, true); | 679 | c->journal.blocks_free * c->sb.block_size, |
680 | PAGE_SECTORS << JSET_BITS)) | ||
681 | return w; | ||
682 | |||
683 | /* XXX: tracepoint */ | ||
684 | if (!journal_full(&c->journal)) { | ||
685 | trace_bcache_journal_entry_full(c); | ||
686 | |||
687 | /* | ||
688 | * XXX: If we were inserting so many keys that they | ||
689 | * won't fit in an _empty_ journal write, we'll | ||
690 | * deadlock. For now, handle this in | ||
691 | * bch_keylist_realloc() - but something to think about. | ||
692 | */ | ||
693 | BUG_ON(!w->data->keys); | ||
694 | |||
695 | closure_wait(&w->wait, &cl); | ||
696 | journal_try_write(c); /* unlocks */ | ||
697 | } else { | ||
698 | trace_bcache_journal_full(c); | ||
699 | |||
700 | closure_wait(&c->journal.wait, &cl); | ||
701 | journal_reclaim(c); | ||
702 | spin_unlock(&c->journal.lock); | ||
703 | |||
704 | btree_flush_write(c); | ||
705 | } | ||
706 | |||
707 | closure_sync(&cl); | ||
708 | spin_lock(&c->journal.lock); | ||
700 | } | 709 | } |
701 | } | 710 | } |
702 | 711 | ||
712 | static void journal_write_work(struct work_struct *work) | ||
713 | { | ||
714 | struct cache_set *c = container_of(to_delayed_work(work), | ||
715 | struct cache_set, | ||
716 | journal.work); | ||
717 | spin_lock(&c->journal.lock); | ||
718 | journal_try_write(c); | ||
719 | } | ||
720 | |||
703 | /* | 721 | /* |
704 | * Entry point to the journalling code - bio_insert() and btree_invalidate() | 722 | * Entry point to the journalling code - bio_insert() and btree_invalidate() |
705 | * pass bch_journal() a list of keys to be journalled, and then | 723 | * pass bch_journal() a list of keys to be journalled, and then |
706 | * bch_journal() hands those same keys off to btree_insert_async() | 724 | * bch_journal() hands those same keys off to btree_insert_async() |
707 | */ | 725 | */ |
708 | 726 | ||
709 | void bch_journal(struct closure *cl) | 727 | atomic_t *bch_journal(struct cache_set *c, |
728 | struct keylist *keys, | ||
729 | struct closure *parent) | ||
710 | { | 730 | { |
711 | struct btree_op *op = container_of(cl, struct btree_op, cl); | ||
712 | struct cache_set *c = op->c; | ||
713 | struct journal_write *w; | 731 | struct journal_write *w; |
714 | size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list; | 732 | atomic_t *ret; |
715 | |||
716 | if (op->type != BTREE_INSERT || | ||
717 | !CACHE_SYNC(&c->sb)) | ||
718 | goto out; | ||
719 | 733 | ||
720 | /* | 734 | if (!CACHE_SYNC(&c->sb)) |
721 | * If we're looping because we errored, might already be waiting on | 735 | return NULL; |
722 | * another journal write: | ||
723 | */ | ||
724 | while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING) | ||
725 | closure_sync(cl->parent); | ||
726 | 736 | ||
727 | spin_lock(&c->journal.lock); | 737 | w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); |
728 | 738 | ||
729 | if (journal_full(&c->journal)) { | 739 | memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys)); |
730 | trace_bcache_journal_full(c); | 740 | w->data->keys += bch_keylist_nkeys(keys); |
731 | 741 | ||
732 | closure_wait(&c->journal.wait, cl); | 742 | ret = &fifo_back(&c->journal.pin); |
743 | atomic_inc(ret); | ||
733 | 744 | ||
734 | journal_reclaim(c); | 745 | if (parent) { |
746 | closure_wait(&w->wait, parent); | ||
747 | journal_try_write(c); | ||
748 | } else if (!w->need_write) { | ||
749 | schedule_delayed_work(&c->journal.work, | ||
750 | msecs_to_jiffies(c->journal_delay_ms)); | ||
751 | spin_unlock(&c->journal.lock); | ||
752 | } else { | ||
735 | spin_unlock(&c->journal.lock); | 753 | spin_unlock(&c->journal.lock); |
736 | |||
737 | btree_flush_write(c); | ||
738 | continue_at(cl, bch_journal, bcache_wq); | ||
739 | } | 754 | } |
740 | 755 | ||
741 | w = c->journal.cur; | ||
742 | w->need_write = true; | ||
743 | b = __set_blocks(w->data, w->data->keys + n, c); | ||
744 | |||
745 | if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS || | ||
746 | b > c->journal.blocks_free) { | ||
747 | trace_bcache_journal_entry_full(c); | ||
748 | |||
749 | /* | ||
750 | * XXX: If we were inserting so many keys that they won't fit in | ||
751 | * an _empty_ journal write, we'll deadlock. For now, handle | ||
752 | * this in bch_keylist_realloc() - but something to think about. | ||
753 | */ | ||
754 | BUG_ON(!w->data->keys); | ||
755 | |||
756 | BUG_ON(!closure_wait(&w->wait, cl)); | ||
757 | |||
758 | closure_flush(&c->journal.io); | ||
759 | 756 | ||
760 | journal_try_write(c); | 757 | return ret; |
761 | continue_at(cl, bch_journal, bcache_wq); | 758 | } |
762 | } | ||
763 | |||
764 | memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t)); | ||
765 | w->data->keys += n; | ||
766 | 759 | ||
767 | op->journal = &fifo_back(&c->journal.pin); | 760 | void bch_journal_meta(struct cache_set *c, struct closure *cl) |
768 | atomic_inc(op->journal); | 761 | { |
762 | struct keylist keys; | ||
763 | atomic_t *ref; | ||
769 | 764 | ||
770 | if (op->flush_journal) { | 765 | bch_keylist_init(&keys); |
771 | closure_flush(&c->journal.io); | ||
772 | closure_wait(&w->wait, cl->parent); | ||
773 | } | ||
774 | 766 | ||
775 | journal_try_write(c); | 767 | ref = bch_journal(c, &keys, cl); |
776 | out: | 768 | if (ref) |
777 | bch_btree_insert_async(cl); | 769 | atomic_dec_bug(ref); |
778 | } | 770 | } |
779 | 771 | ||
780 | void bch_journal_free(struct cache_set *c) | 772 | void bch_journal_free(struct cache_set *c) |
@@ -790,6 +782,7 @@ int bch_journal_alloc(struct cache_set *c) | |||
790 | 782 | ||
791 | closure_init_unlocked(&j->io); | 783 | closure_init_unlocked(&j->io); |
792 | spin_lock_init(&j->lock); | 784 | spin_lock_init(&j->lock); |
785 | INIT_DELAYED_WORK(&j->work, journal_write_work); | ||
793 | 786 | ||
794 | c->journal_delay_ms = 100; | 787 | c->journal_delay_ms = 100; |
795 | 788 | ||
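Summary of the submission policy introduced in journal.c above, as a userspace model of the decision at the end of the new bch_journal() plus journal_write_work() (the name write_pending here stands in for w->need_write, an assumption of this sketch): a caller that passes a parent closure gets its entry flushed immediately, the first caller of an idle journal arms a delayed work item for journal_delay_ms, and later callers piggyback on the write that is already queued.

        #include <stdbool.h>
        #include <stdio.h>

        enum journal_action { FLUSH_NOW, ARM_DELAYED_WORK, PIGGYBACK };

        static enum journal_action submit_policy(bool caller_waits, bool write_pending)
        {
                if (caller_waits)
                        return FLUSH_NOW;         /* closure_wait(&w->wait, parent); write now */
                if (!write_pending)
                        return ARM_DELAYED_WORK;  /* schedule_delayed_work(journal_delay_ms)   */
                return PIGGYBACK;                 /* a write is already scheduled; just unlock */
        }

        int main(void)
        {
                printf("waiting caller     -> %d\n", submit_policy(true, false));
                printf("first async caller -> %d\n", submit_policy(false, false));
                printf("later async caller -> %d\n", submit_policy(false, true));
                return 0;
        }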
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index 3d7851274b04..a6472fda94b2 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h | |||
@@ -75,43 +75,6 @@ | |||
75 | * nodes that are pinning the oldest journal entries first. | 75 | * nodes that are pinning the oldest journal entries first. |
76 | */ | 76 | */ |
77 | 77 | ||
78 | #define BCACHE_JSET_VERSION_UUIDv1 1 | ||
79 | /* Always latest UUID format */ | ||
80 | #define BCACHE_JSET_VERSION_UUID 1 | ||
81 | #define BCACHE_JSET_VERSION 1 | ||
82 | |||
83 | /* | ||
84 | * On disk format for a journal entry: | ||
85 | * seq is monotonically increasing; every journal entry has its own unique | ||
86 | * sequence number. | ||
87 | * | ||
88 | * last_seq is the oldest journal entry that still has keys the btree hasn't | ||
89 | * flushed to disk yet. | ||
90 | * | ||
91 | * version is for on disk format changes. | ||
92 | */ | ||
93 | struct jset { | ||
94 | uint64_t csum; | ||
95 | uint64_t magic; | ||
96 | uint64_t seq; | ||
97 | uint32_t version; | ||
98 | uint32_t keys; | ||
99 | |||
100 | uint64_t last_seq; | ||
101 | |||
102 | BKEY_PADDED(uuid_bucket); | ||
103 | BKEY_PADDED(btree_root); | ||
104 | uint16_t btree_level; | ||
105 | uint16_t pad[3]; | ||
106 | |||
107 | uint64_t prio_bucket[MAX_CACHES_PER_SET]; | ||
108 | |||
109 | union { | ||
110 | struct bkey start[0]; | ||
111 | uint64_t d[0]; | ||
112 | }; | ||
113 | }; | ||
114 | |||
115 | /* | 78 | /* |
116 | * Only used for holding the journal entries we read in btree_journal_read() | 79 | * Only used for holding the journal entries we read in btree_journal_read() |
117 | * during cache_registration | 80 | * during cache_registration |
@@ -140,7 +103,8 @@ struct journal { | |||
140 | spinlock_t lock; | 103 | spinlock_t lock; |
141 | /* used when waiting because the journal was full */ | 104 | /* used when waiting because the journal was full */ |
142 | struct closure_waitlist wait; | 105 | struct closure_waitlist wait; |
143 | struct closure_with_timer io; | 106 | struct closure io; |
107 | struct delayed_work work; | ||
144 | 108 | ||
145 | /* Number of blocks free in the bucket(s) we're currently writing to */ | 109 | /* Number of blocks free in the bucket(s) we're currently writing to */ |
146 | unsigned blocks_free; | 110 | unsigned blocks_free; |
@@ -188,8 +152,7 @@ struct journal_device { | |||
188 | }; | 152 | }; |
189 | 153 | ||
190 | #define journal_pin_cmp(c, l, r) \ | 154 | #define journal_pin_cmp(c, l, r) \ |
191 | (fifo_idx(&(c)->journal.pin, (l)->journal) > \ | 155 | (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) |
192 | fifo_idx(&(c)->journal.pin, (r)->journal)) | ||
193 | 156 | ||
194 | #define JOURNAL_PIN 20000 | 157 | #define JOURNAL_PIN 20000 |
195 | 158 | ||
@@ -199,15 +162,14 @@ struct journal_device { | |||
199 | struct closure; | 162 | struct closure; |
200 | struct cache_set; | 163 | struct cache_set; |
201 | struct btree_op; | 164 | struct btree_op; |
165 | struct keylist; | ||
202 | 166 | ||
203 | void bch_journal(struct closure *); | 167 | atomic_t *bch_journal(struct cache_set *, struct keylist *, struct closure *); |
204 | void bch_journal_next(struct journal *); | 168 | void bch_journal_next(struct journal *); |
205 | void bch_journal_mark(struct cache_set *, struct list_head *); | 169 | void bch_journal_mark(struct cache_set *, struct list_head *); |
206 | void bch_journal_meta(struct cache_set *, struct closure *); | 170 | void bch_journal_meta(struct cache_set *, struct closure *); |
207 | int bch_journal_read(struct cache_set *, struct list_head *, | 171 | int bch_journal_read(struct cache_set *, struct list_head *); |
208 | struct btree_op *); | 172 | int bch_journal_replay(struct cache_set *, struct list_head *); |
209 | int bch_journal_replay(struct cache_set *, struct list_head *, | ||
210 | struct btree_op *); | ||
211 | 173 | ||
212 | void bch_journal_free(struct cache_set *); | 174 | void bch_journal_free(struct cache_set *); |
213 | int bch_journal_alloc(struct cache_set *); | 175 | int bch_journal_alloc(struct cache_set *); |
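The reworked journal_pin_cmp() in journal.h compares the two pins' positions in the journal's pin FIFO directly. A toy ring-buffer model of that comparison (generic index math; bcache's fifo_idx() is assumed to behave like this, which is not shown in the patch): an entry further from the FIFO's front is newer, so comparing indices tells which of two pinned entries is older.

        #include <stdio.h>

        #define FIFO_SIZE 8                     /* power of two, mask = size - 1 */
        #define FIFO_MASK (FIFO_SIZE - 1)

        /* Distance of slot 'pos' from the FIFO's front (its oldest element). */
        static unsigned fifo_idx(unsigned front, unsigned pos)
        {
                return (pos - front) & FIFO_MASK;
        }

        int main(void)
        {
                unsigned front = 6;             /* FIFO has wrapped around        */
                unsigned pin_l = 7, pin_r = 1;  /* two journal pins in the FIFO   */

                /* journal_pin_cmp(c, l, r): true when l belongs to a newer entry */
                int l_newer = fifo_idx(front, pin_l) > fifo_idx(front, pin_r);
                printf("l idx=%u r idx=%u -> l is %s than r\n",
                       fifo_idx(front, pin_l), fifo_idx(front, pin_r),
                       l_newer ? "newer" : "older");
                return 0;
        }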
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 1a3b4f4786c3..7c1275e66025 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c | |||
@@ -12,8 +12,9 @@ | |||
12 | #include <trace/events/bcache.h> | 12 | #include <trace/events/bcache.h> |
13 | 13 | ||
14 | struct moving_io { | 14 | struct moving_io { |
15 | struct closure cl; | ||
15 | struct keybuf_key *w; | 16 | struct keybuf_key *w; |
16 | struct search s; | 17 | struct data_insert_op op; |
17 | struct bbio bio; | 18 | struct bbio bio; |
18 | }; | 19 | }; |
19 | 20 | ||
@@ -38,13 +39,13 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k) | |||
38 | 39 | ||
39 | static void moving_io_destructor(struct closure *cl) | 40 | static void moving_io_destructor(struct closure *cl) |
40 | { | 41 | { |
41 | struct moving_io *io = container_of(cl, struct moving_io, s.cl); | 42 | struct moving_io *io = container_of(cl, struct moving_io, cl); |
42 | kfree(io); | 43 | kfree(io); |
43 | } | 44 | } |
44 | 45 | ||
45 | static void write_moving_finish(struct closure *cl) | 46 | static void write_moving_finish(struct closure *cl) |
46 | { | 47 | { |
47 | struct moving_io *io = container_of(cl, struct moving_io, s.cl); | 48 | struct moving_io *io = container_of(cl, struct moving_io, cl); |
48 | struct bio *bio = &io->bio.bio; | 49 | struct bio *bio = &io->bio.bio; |
49 | struct bio_vec *bv; | 50 | struct bio_vec *bv; |
50 | int i; | 51 | int i; |
@@ -52,13 +53,12 @@ static void write_moving_finish(struct closure *cl) | |||
52 | bio_for_each_segment_all(bv, bio, i) | 53 | bio_for_each_segment_all(bv, bio, i) |
53 | __free_page(bv->bv_page); | 54 | __free_page(bv->bv_page); |
54 | 55 | ||
55 | if (io->s.op.insert_collision) | 56 | if (io->op.replace_collision) |
56 | trace_bcache_gc_copy_collision(&io->w->key); | 57 | trace_bcache_gc_copy_collision(&io->w->key); |
57 | 58 | ||
58 | bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); | 59 | bch_keybuf_del(&io->op.c->moving_gc_keys, io->w); |
59 | 60 | ||
60 | atomic_dec_bug(&io->s.op.c->in_flight); | 61 | up(&io->op.c->moving_in_flight); |
61 | closure_wake_up(&io->s.op.c->moving_gc_wait); | ||
62 | 62 | ||
63 | closure_return_with_destructor(cl, moving_io_destructor); | 63 | closure_return_with_destructor(cl, moving_io_destructor); |
64 | } | 64 | } |
@@ -66,12 +66,12 @@ static void write_moving_finish(struct closure *cl) | |||
66 | static void read_moving_endio(struct bio *bio, int error) | 66 | static void read_moving_endio(struct bio *bio, int error) |
67 | { | 67 | { |
68 | struct moving_io *io = container_of(bio->bi_private, | 68 | struct moving_io *io = container_of(bio->bi_private, |
69 | struct moving_io, s.cl); | 69 | struct moving_io, cl); |
70 | 70 | ||
71 | if (error) | 71 | if (error) |
72 | io->s.error = error; | 72 | io->op.error = error; |
73 | 73 | ||
74 | bch_bbio_endio(io->s.op.c, bio, error, "reading data to move"); | 74 | bch_bbio_endio(io->op.c, bio, error, "reading data to move"); |
75 | } | 75 | } |
76 | 76 | ||
77 | static void moving_init(struct moving_io *io) | 77 | static void moving_init(struct moving_io *io) |
@@ -85,54 +85,53 @@ static void moving_init(struct moving_io *io) | |||
85 | bio->bi_size = KEY_SIZE(&io->w->key) << 9; | 85 | bio->bi_size = KEY_SIZE(&io->w->key) << 9; |
86 | bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), | 86 | bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), |
87 | PAGE_SECTORS); | 87 | PAGE_SECTORS); |
88 | bio->bi_private = &io->s.cl; | 88 | bio->bi_private = &io->cl; |
89 | bio->bi_io_vec = bio->bi_inline_vecs; | 89 | bio->bi_io_vec = bio->bi_inline_vecs; |
90 | bch_bio_map(bio, NULL); | 90 | bch_bio_map(bio, NULL); |
91 | } | 91 | } |
92 | 92 | ||
93 | static void write_moving(struct closure *cl) | 93 | static void write_moving(struct closure *cl) |
94 | { | 94 | { |
95 | struct search *s = container_of(cl, struct search, cl); | 95 | struct moving_io *io = container_of(cl, struct moving_io, cl); |
96 | struct moving_io *io = container_of(s, struct moving_io, s); | 96 | struct data_insert_op *op = &io->op; |
97 | 97 | ||
98 | if (!s->error) { | 98 | if (!op->error) { |
99 | moving_init(io); | 99 | moving_init(io); |
100 | 100 | ||
101 | io->bio.bio.bi_sector = KEY_START(&io->w->key); | 101 | io->bio.bio.bi_sector = KEY_START(&io->w->key); |
102 | s->op.lock = -1; | 102 | op->write_prio = 1; |
103 | s->op.write_prio = 1; | 103 | op->bio = &io->bio.bio; |
104 | s->op.cache_bio = &io->bio.bio; | ||
105 | 104 | ||
106 | s->writeback = KEY_DIRTY(&io->w->key); | 105 | op->writeback = KEY_DIRTY(&io->w->key); |
107 | s->op.csum = KEY_CSUM(&io->w->key); | 106 | op->csum = KEY_CSUM(&io->w->key); |
108 | 107 | ||
109 | s->op.type = BTREE_REPLACE; | 108 | bkey_copy(&op->replace_key, &io->w->key); |
110 | bkey_copy(&s->op.replace, &io->w->key); | 109 | op->replace = true; |
111 | 110 | ||
112 | closure_init(&s->op.cl, cl); | 111 | closure_call(&op->cl, bch_data_insert, NULL, cl); |
113 | bch_insert_data(&s->op.cl); | ||
114 | } | 112 | } |
115 | 113 | ||
116 | continue_at(cl, write_moving_finish, NULL); | 114 | continue_at(cl, write_moving_finish, system_wq); |
117 | } | 115 | } |
118 | 116 | ||
119 | static void read_moving_submit(struct closure *cl) | 117 | static void read_moving_submit(struct closure *cl) |
120 | { | 118 | { |
121 | struct search *s = container_of(cl, struct search, cl); | 119 | struct moving_io *io = container_of(cl, struct moving_io, cl); |
122 | struct moving_io *io = container_of(s, struct moving_io, s); | ||
123 | struct bio *bio = &io->bio.bio; | 120 | struct bio *bio = &io->bio.bio; |
124 | 121 | ||
125 | bch_submit_bbio(bio, s->op.c, &io->w->key, 0); | 122 | bch_submit_bbio(bio, io->op.c, &io->w->key, 0); |
126 | 123 | ||
127 | continue_at(cl, write_moving, bch_gc_wq); | 124 | continue_at(cl, write_moving, system_wq); |
128 | } | 125 | } |
129 | 126 | ||
130 | static void read_moving(struct closure *cl) | 127 | static void read_moving(struct cache_set *c) |
131 | { | 128 | { |
132 | struct cache_set *c = container_of(cl, struct cache_set, moving_gc); | ||
133 | struct keybuf_key *w; | 129 | struct keybuf_key *w; |
134 | struct moving_io *io; | 130 | struct moving_io *io; |
135 | struct bio *bio; | 131 | struct bio *bio; |
132 | struct closure cl; | ||
133 | |||
134 | closure_init_stack(&cl); | ||
136 | 135 | ||
137 | /* XXX: if we error, background writeback could stall indefinitely */ | 136 | /* XXX: if we error, background writeback could stall indefinitely */ |
138 | 137 | ||
@@ -150,8 +149,8 @@ static void read_moving(struct closure *cl) | |||
150 | 149 | ||
151 | w->private = io; | 150 | w->private = io; |
152 | io->w = w; | 151 | io->w = w; |
153 | io->s.op.inode = KEY_INODE(&w->key); | 152 | io->op.inode = KEY_INODE(&w->key); |
154 | io->s.op.c = c; | 153 | io->op.c = c; |
155 | 154 | ||
156 | moving_init(io); | 155 | moving_init(io); |
157 | bio = &io->bio.bio; | 156 | bio = &io->bio.bio; |
@@ -164,13 +163,8 @@ static void read_moving(struct closure *cl) | |||
164 | 163 | ||
165 | trace_bcache_gc_copy(&w->key); | 164 | trace_bcache_gc_copy(&w->key); |
166 | 165 | ||
167 | closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); | 166 | down(&c->moving_in_flight); |
168 | 167 | closure_call(&io->cl, read_moving_submit, NULL, &cl); | |
169 | if (atomic_inc_return(&c->in_flight) >= 64) { | ||
170 | closure_wait_event(&c->moving_gc_wait, cl, | ||
171 | atomic_read(&c->in_flight) < 64); | ||
172 | continue_at(cl, read_moving, bch_gc_wq); | ||
173 | } | ||
174 | } | 168 | } |
175 | 169 | ||
176 | if (0) { | 170 | if (0) { |
@@ -180,7 +174,7 @@ err: if (!IS_ERR_OR_NULL(w->private)) | |||
180 | bch_keybuf_del(&c->moving_gc_keys, w); | 174 | bch_keybuf_del(&c->moving_gc_keys, w); |
181 | } | 175 | } |
182 | 176 | ||
183 | closure_return(cl); | 177 | closure_sync(&cl); |
184 | } | 178 | } |
185 | 179 | ||
186 | static bool bucket_cmp(struct bucket *l, struct bucket *r) | 180 | static bool bucket_cmp(struct bucket *l, struct bucket *r) |
@@ -193,15 +187,14 @@ static unsigned bucket_heap_top(struct cache *ca) | |||
193 | return GC_SECTORS_USED(heap_peek(&ca->heap)); | 187 | return GC_SECTORS_USED(heap_peek(&ca->heap)); |
194 | } | 188 | } |
195 | 189 | ||
196 | void bch_moving_gc(struct closure *cl) | 190 | void bch_moving_gc(struct cache_set *c) |
197 | { | 191 | { |
198 | struct cache_set *c = container_of(cl, struct cache_set, gc.cl); | ||
199 | struct cache *ca; | 192 | struct cache *ca; |
200 | struct bucket *b; | 193 | struct bucket *b; |
201 | unsigned i; | 194 | unsigned i; |
202 | 195 | ||
203 | if (!c->copy_gc_enabled) | 196 | if (!c->copy_gc_enabled) |
204 | closure_return(cl); | 197 | return; |
205 | 198 | ||
206 | mutex_lock(&c->bucket_lock); | 199 | mutex_lock(&c->bucket_lock); |
207 | 200 | ||
@@ -242,13 +235,11 @@ void bch_moving_gc(struct closure *cl) | |||
242 | 235 | ||
243 | c->moving_gc_keys.last_scanned = ZERO_KEY; | 236 | c->moving_gc_keys.last_scanned = ZERO_KEY; |
244 | 237 | ||
245 | closure_init(&c->moving_gc, cl); | 238 | read_moving(c); |
246 | read_moving(&c->moving_gc); | ||
247 | |||
248 | closure_return(cl); | ||
249 | } | 239 | } |
250 | 240 | ||
251 | void bch_moving_init_cache_set(struct cache_set *c) | 241 | void bch_moving_init_cache_set(struct cache_set *c) |
252 | { | 242 | { |
253 | bch_keybuf_init(&c->moving_gc_keys); | 243 | bch_keybuf_init(&c->moving_gc_keys); |
244 | sema_init(&c->moving_in_flight, 64); | ||
254 | } | 245 | } |
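The movinggc.c change above replaces the atomic in_flight counter and closure waitlist with a plain counting semaphore capped at 64 copies (sema_init() at init, down() before issuing a copy, up() when it finishes). A small userspace sketch of that gating pattern, with POSIX semaphores standing in for the kernel primitives and purely illustrative numbers:

        #include <semaphore.h>
        #include <stdio.h>

        #define MOVING_IN_FLIGHT 64

        static sem_t moving_in_flight;

        static void issue_copy(int n)
        {
                sem_wait(&moving_in_flight);    /* down(): blocks once 64 are in flight */
                printf("copy %d issued\n", n);
        }

        static void copy_done(int n)
        {
                printf("copy %d done\n", n);
                sem_post(&moving_in_flight);    /* up(): completion frees a slot */
        }

        int main(void)
        {
                sem_init(&moving_in_flight, 0, MOVING_IN_FLIGHT);
                for (int i = 0; i < 4; i++) {
                        issue_copy(i);
                        copy_done(i);
                }
                sem_destroy(&moving_in_flight);
                return 0;
        }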
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 2a7f0dd6abab..fbcc851ed5a5 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c | |||
@@ -25,7 +25,7 @@ | |||
25 | 25 | ||
26 | struct kmem_cache *bch_search_cache; | 26 | struct kmem_cache *bch_search_cache; |
27 | 27 | ||
28 | static void check_should_skip(struct cached_dev *, struct search *); | 28 | static void bch_data_insert_start(struct closure *); |
29 | 29 | ||
30 | /* Cgroup interface */ | 30 | /* Cgroup interface */ |
31 | 31 | ||
@@ -213,221 +213,79 @@ static void bio_csum(struct bio *bio, struct bkey *k) | |||
213 | 213 | ||
214 | /* Insert data into cache */ | 214 | /* Insert data into cache */ |
215 | 215 | ||
216 | static void bio_invalidate(struct closure *cl) | 216 | static void bch_data_insert_keys(struct closure *cl) |
217 | { | 217 | { |
218 | struct btree_op *op = container_of(cl, struct btree_op, cl); | 218 | struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); |
219 | struct bio *bio = op->cache_bio; | 219 | atomic_t *journal_ref = NULL; |
220 | 220 | struct bkey *replace_key = op->replace ? &op->replace_key : NULL; | |
221 | pr_debug("invalidating %i sectors from %llu", | 221 | int ret; |
222 | bio_sectors(bio), (uint64_t) bio->bi_sector); | ||
223 | |||
224 | while (bio_sectors(bio)) { | ||
225 | unsigned len = min(bio_sectors(bio), 1U << 14); | ||
226 | |||
227 | if (bch_keylist_realloc(&op->keys, 0, op->c)) | ||
228 | goto out; | ||
229 | |||
230 | bio->bi_sector += len; | ||
231 | bio->bi_size -= len << 9; | ||
232 | |||
233 | bch_keylist_add(&op->keys, | ||
234 | &KEY(op->inode, bio->bi_sector, len)); | ||
235 | } | ||
236 | |||
237 | op->insert_data_done = true; | ||
238 | bio_put(bio); | ||
239 | out: | ||
240 | continue_at(cl, bch_journal, bcache_wq); | ||
241 | } | ||
242 | |||
243 | struct open_bucket { | ||
244 | struct list_head list; | ||
245 | struct task_struct *last; | ||
246 | unsigned sectors_free; | ||
247 | BKEY_PADDED(key); | ||
248 | }; | ||
249 | |||
250 | void bch_open_buckets_free(struct cache_set *c) | ||
251 | { | ||
252 | struct open_bucket *b; | ||
253 | |||
254 | while (!list_empty(&c->data_buckets)) { | ||
255 | b = list_first_entry(&c->data_buckets, | ||
256 | struct open_bucket, list); | ||
257 | list_del(&b->list); | ||
258 | kfree(b); | ||
259 | } | ||
260 | } | ||
261 | |||
262 | int bch_open_buckets_alloc(struct cache_set *c) | ||
263 | { | ||
264 | int i; | ||
265 | |||
266 | spin_lock_init(&c->data_bucket_lock); | ||
267 | |||
268 | for (i = 0; i < 6; i++) { | ||
269 | struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); | ||
270 | if (!b) | ||
271 | return -ENOMEM; | ||
272 | |||
273 | list_add(&b->list, &c->data_buckets); | ||
274 | } | ||
275 | |||
276 | return 0; | ||
277 | } | ||
278 | |||
279 | /* | ||
280 | * We keep multiple buckets open for writes, and try to segregate different | ||
281 | * write streams for better cache utilization: first we look for a bucket where | ||
282 | * the last write to it was sequential with the current write, and failing that | ||
283 | * we look for a bucket that was last used by the same task. | ||
284 | * | ||
289 | * The idea is that if you've got multiple tasks pulling data into the cache at the | ||
286 | * same time, you'll get better cache utilization if you try to segregate their | ||
287 | * data and preserve locality. | ||
288 | * | ||
289 | * For example, say you've starting Firefox at the same time you're copying a | ||
290 | * bunch of files. Firefox will likely end up being fairly hot and stay in the | ||
291 | * cache awhile, but the data you copied might not be; if you wrote all that | ||
292 | * data to the same buckets it'd get invalidated at the same time. | ||
293 | * | ||
294 | * Both of those tasks will be doing fairly random IO so we can't rely on | ||
295 | * detecting sequential IO to segregate their data, but going off of the task | ||
296 | * should be a sane heuristic. | ||
297 | */ | ||
298 | static struct open_bucket *pick_data_bucket(struct cache_set *c, | ||
299 | const struct bkey *search, | ||
300 | struct task_struct *task, | ||
301 | struct bkey *alloc) | ||
302 | { | ||
303 | struct open_bucket *ret, *ret_task = NULL; | ||
304 | |||
305 | list_for_each_entry_reverse(ret, &c->data_buckets, list) | ||
306 | if (!bkey_cmp(&ret->key, search)) | ||
307 | goto found; | ||
308 | else if (ret->last == task) | ||
309 | ret_task = ret; | ||
310 | |||
311 | ret = ret_task ?: list_first_entry(&c->data_buckets, | ||
312 | struct open_bucket, list); | ||
313 | found: | ||
314 | if (!ret->sectors_free && KEY_PTRS(alloc)) { | ||
315 | ret->sectors_free = c->sb.bucket_size; | ||
316 | bkey_copy(&ret->key, alloc); | ||
317 | bkey_init(alloc); | ||
318 | } | ||
319 | |||
320 | if (!ret->sectors_free) | ||
321 | ret = NULL; | ||
322 | |||
323 | return ret; | ||
324 | } | ||
325 | |||
326 | /* | ||
327 | * Allocates some space in the cache to write to, and k to point to the newly | ||
328 | * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the | ||
329 | * end of the newly allocated space). | ||
330 | * | ||
331 | * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many | ||
332 | * sectors were actually allocated. | ||
333 | * | ||
334 | * If s->writeback is true, will not fail. | ||
335 | */ | ||
336 | static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, | ||
337 | struct search *s) | ||
338 | { | ||
339 | struct cache_set *c = s->op.c; | ||
340 | struct open_bucket *b; | ||
341 | BKEY_PADDED(key) alloc; | ||
342 | struct closure cl, *w = NULL; | ||
343 | unsigned i; | ||
344 | |||
345 | if (s->writeback) { | ||
346 | closure_init_stack(&cl); | ||
347 | w = &cl; | ||
348 | } | ||
349 | 222 | ||
350 | /* | 223 | /* |
351 | * We might have to allocate a new bucket, which we can't do with a | 224 | * If we're looping, might already be waiting on |
352 | * spinlock held. So if we have to allocate, we drop the lock, allocate | 225 | * another journal write - can't wait on more than one journal write at |
353 | * and then retry. KEY_PTRS() indicates whether alloc points to | 226 | * a time |
354 | * allocated bucket(s). | 227 | * |
228 | * XXX: this looks wrong | ||
355 | */ | 229 | */ |
230 | #if 0 | ||
231 | while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING) | ||
232 | closure_sync(&s->cl); | ||
233 | #endif | ||
356 | 234 | ||
357 | bkey_init(&alloc.key); | 235 | if (!op->replace) |
358 | spin_lock(&c->data_bucket_lock); | 236 | journal_ref = bch_journal(op->c, &op->insert_keys, |
359 | 237 | op->flush_journal ? cl : NULL); | |
360 | while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) { | ||
361 | unsigned watermark = s->op.write_prio | ||
362 | ? WATERMARK_MOVINGGC | ||
363 | : WATERMARK_NONE; | ||
364 | |||
365 | spin_unlock(&c->data_bucket_lock); | ||
366 | |||
367 | if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w)) | ||
368 | return false; | ||
369 | 238 | ||
370 | spin_lock(&c->data_bucket_lock); | 239 | ret = bch_btree_insert(op->c, &op->insert_keys, |
240 | journal_ref, replace_key); | ||
241 | if (ret == -ESRCH) { | ||
242 | op->replace_collision = true; | ||
243 | } else if (ret) { | ||
244 | op->error = -ENOMEM; | ||
245 | op->insert_data_done = true; | ||
371 | } | 246 | } |
372 | 247 | ||
373 | /* | 248 | if (journal_ref) |
374 | * If we had to allocate, we might race and not need to allocate the | 249 | atomic_dec_bug(journal_ref); |
375 | * second time we call find_data_bucket(). If we allocated a bucket but | ||
376 | * didn't use it, drop the refcount bch_bucket_alloc_set() took: | ||
377 | */ | ||
378 | if (KEY_PTRS(&alloc.key)) | ||
379 | __bkey_put(c, &alloc.key); | ||
380 | |||
381 | for (i = 0; i < KEY_PTRS(&b->key); i++) | ||
382 | EBUG_ON(ptr_stale(c, &b->key, i)); | ||
383 | 250 | ||
384 | /* Set up the pointer to the space we're allocating: */ | 251 | if (!op->insert_data_done) |
252 | continue_at(cl, bch_data_insert_start, bcache_wq); | ||
385 | 253 | ||
386 | for (i = 0; i < KEY_PTRS(&b->key); i++) | 254 | bch_keylist_free(&op->insert_keys); |
387 | k->ptr[i] = b->key.ptr[i]; | 255 | closure_return(cl); |
256 | } | ||
388 | 257 | ||
389 | sectors = min(sectors, b->sectors_free); | 258 | static void bch_data_invalidate(struct closure *cl) |
259 | { | ||
260 | struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); | ||
261 | struct bio *bio = op->bio; | ||
390 | 262 | ||
391 | SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); | 263 | pr_debug("invalidating %i sectors from %llu", |
392 | SET_KEY_SIZE(k, sectors); | 264 | bio_sectors(bio), (uint64_t) bio->bi_sector); |
393 | SET_KEY_PTRS(k, KEY_PTRS(&b->key)); | ||
394 | 265 | ||
395 | /* | 266 | while (bio_sectors(bio)) { |
396 | * Move b to the end of the lru, and keep track of what this bucket was | 267 | unsigned sectors = min(bio_sectors(bio), |
397 | * last used for: | 268 | 1U << (KEY_SIZE_BITS - 1)); |
398 | */ | ||
399 | list_move_tail(&b->list, &c->data_buckets); | ||
400 | bkey_copy_key(&b->key, k); | ||
401 | b->last = s->task; | ||
402 | 269 | ||
403 | b->sectors_free -= sectors; | 270 | if (bch_keylist_realloc(&op->insert_keys, 0, op->c)) |
271 | goto out; | ||
404 | 272 | ||
405 | for (i = 0; i < KEY_PTRS(&b->key); i++) { | 273 | bio->bi_sector += sectors; |
406 | SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); | 274 | bio->bi_size -= sectors << 9; |
407 | 275 | ||
408 | atomic_long_add(sectors, | 276 | bch_keylist_add(&op->insert_keys, |
409 | &PTR_CACHE(c, &b->key, i)->sectors_written); | 277 | &KEY(op->inode, bio->bi_sector, sectors)); |
410 | } | 278 | } |
411 | 279 | ||
412 | if (b->sectors_free < c->sb.block_size) | 280 | op->insert_data_done = true; |
413 | b->sectors_free = 0; | 281 | bio_put(bio); |
414 | 282 | out: | |
415 | /* | 283 | continue_at(cl, bch_data_insert_keys, bcache_wq); |
416 | * k takes refcounts on the buckets it points to until it's inserted | ||
417 | * into the btree, but if we're done with this bucket we just transfer | ||
418 | * get_data_bucket()'s refcount. | ||
419 | */ | ||
420 | if (b->sectors_free) | ||
421 | for (i = 0; i < KEY_PTRS(&b->key); i++) | ||
422 | atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); | ||
423 | |||
424 | spin_unlock(&c->data_bucket_lock); | ||
425 | return true; | ||
426 | } | 284 | } |
427 | 285 | ||
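The new bch_data_invalidate() above chops the bypassed region into keys of at most 1 << (KEY_SIZE_BITS - 1) sectors each, where the old bio_invalidate() hard-coded 1 << 14. A plain-C sketch of that chunking, with the maximum key size kept as a parameter since KEY_SIZE_BITS itself is not shown in this patch:

        #include <stdio.h>

        /* Split an invalidation of 'sectors' sectors starting at 'start' into keys no
         * larger than 'max_key_sectors' (stand-in for 1 << (KEY_SIZE_BITS - 1)). Keys
         * are emitted the way the patch does it: offset is the end of the extent. */
        static void invalidate_region(unsigned long long start, unsigned sectors,
                                      unsigned max_key_sectors)
        {
                while (sectors) {
                        unsigned len = sectors < max_key_sectors ? sectors : max_key_sectors;

                        printf("KEY(inode, %llu, %u)\n", start + len, len);
                        start += len;
                        sectors -= len;
                }
        }

        int main(void)
        {
                /* 40000 sectors with a 16384-sector cap -> keys of 16384, 16384, 7232 */
                invalidate_region(1000, 40000, 16384);
                return 0;
        }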
428 | static void bch_insert_data_error(struct closure *cl) | 286 | static void bch_data_insert_error(struct closure *cl) |
429 | { | 287 | { |
430 | struct btree_op *op = container_of(cl, struct btree_op, cl); | 288 | struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); |
431 | 289 | ||
432 | /* | 290 | /* |
433 | * Our data write just errored, which means we've got a bunch of keys to | 291 | * Our data write just errored, which means we've got a bunch of keys to |
@@ -438,35 +296,34 @@ static void bch_insert_data_error(struct closure *cl) | |||
438 | * from the keys we'll accomplish just that. | 296 | * from the keys we'll accomplish just that. |
439 | */ | 297 | */ |
440 | 298 | ||
441 | struct bkey *src = op->keys.bottom, *dst = op->keys.bottom; | 299 | struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys; |
442 | 300 | ||
443 | while (src != op->keys.top) { | 301 | while (src != op->insert_keys.top) { |
444 | struct bkey *n = bkey_next(src); | 302 | struct bkey *n = bkey_next(src); |
445 | 303 | ||
446 | SET_KEY_PTRS(src, 0); | 304 | SET_KEY_PTRS(src, 0); |
447 | bkey_copy(dst, src); | 305 | memmove(dst, src, bkey_bytes(src)); |
448 | 306 | ||
449 | dst = bkey_next(dst); | 307 | dst = bkey_next(dst); |
450 | src = n; | 308 | src = n; |
451 | } | 309 | } |
452 | 310 | ||
453 | op->keys.top = dst; | 311 | op->insert_keys.top = dst; |
454 | 312 | ||
455 | bch_journal(cl); | 313 | bch_data_insert_keys(cl); |
456 | } | 314 | } |
457 | 315 | ||
458 | static void bch_insert_data_endio(struct bio *bio, int error) | 316 | static void bch_data_insert_endio(struct bio *bio, int error) |
459 | { | 317 | { |
460 | struct closure *cl = bio->bi_private; | 318 | struct closure *cl = bio->bi_private; |
461 | struct btree_op *op = container_of(cl, struct btree_op, cl); | 319 | struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); |
462 | struct search *s = container_of(op, struct search, op); | ||
463 | 320 | ||
464 | if (error) { | 321 | if (error) { |
465 | /* TODO: We could try to recover from this. */ | 322 | /* TODO: We could try to recover from this. */ |
466 | if (s->writeback) | 323 | if (op->writeback) |
467 | s->error = error; | 324 | op->error = error; |
468 | else if (s->write) | 325 | else if (!op->replace) |
469 | set_closure_fn(cl, bch_insert_data_error, bcache_wq); | 326 | set_closure_fn(cl, bch_data_insert_error, bcache_wq); |
470 | else | 327 | else |
471 | set_closure_fn(cl, NULL, NULL); | 328 | set_closure_fn(cl, NULL, NULL); |
472 | } | 329 | } |
@@ -474,18 +331,17 @@ static void bch_insert_data_endio(struct bio *bio, int error) | |||
474 | bch_bbio_endio(op->c, bio, error, "writing data to cache"); | 331 | bch_bbio_endio(op->c, bio, error, "writing data to cache"); |
475 | } | 332 | } |
476 | 333 | ||
477 | static void bch_insert_data_loop(struct closure *cl) | 334 | static void bch_data_insert_start(struct closure *cl) |
478 | { | 335 | { |
479 | struct btree_op *op = container_of(cl, struct btree_op, cl); | 336 | struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); |
480 | struct search *s = container_of(op, struct search, op); | 337 | struct bio *bio = op->bio, *n; |
481 | struct bio *bio = op->cache_bio, *n; | ||
482 | 338 | ||
483 | if (op->skip) | 339 | if (op->bypass) |
484 | return bio_invalidate(cl); | 340 | return bch_data_invalidate(cl); |
485 | 341 | ||
486 | if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { | 342 | if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { |
487 | set_gc_sectors(op->c); | 343 | set_gc_sectors(op->c); |
488 | bch_queue_gc(op->c); | 344 | wake_up_gc(op->c); |
489 | } | 345 | } |
490 | 346 | ||
491 | /* | 347 | /* |
@@ -497,29 +353,30 @@ static void bch_insert_data_loop(struct closure *cl) | |||
497 | do { | 353 | do { |
498 | unsigned i; | 354 | unsigned i; |
499 | struct bkey *k; | 355 | struct bkey *k; |
500 | struct bio_set *split = s->d | 356 | struct bio_set *split = op->c->bio_split; |
501 | ? s->d->bio_split : op->c->bio_split; | ||
502 | 357 | ||
503 | /* 1 for the device pointer and 1 for the chksum */ | 358 | /* 1 for the device pointer and 1 for the chksum */ |
504 | if (bch_keylist_realloc(&op->keys, | 359 | if (bch_keylist_realloc(&op->insert_keys, |
505 | 1 + (op->csum ? 1 : 0), | 360 | 1 + (op->csum ? 1 : 0), |
506 | op->c)) | 361 | op->c)) |
507 | continue_at(cl, bch_journal, bcache_wq); | 362 | continue_at(cl, bch_data_insert_keys, bcache_wq); |
508 | 363 | ||
509 | k = op->keys.top; | 364 | k = op->insert_keys.top; |
510 | bkey_init(k); | 365 | bkey_init(k); |
511 | SET_KEY_INODE(k, op->inode); | 366 | SET_KEY_INODE(k, op->inode); |
512 | SET_KEY_OFFSET(k, bio->bi_sector); | 367 | SET_KEY_OFFSET(k, bio->bi_sector); |
513 | 368 | ||
514 | if (!bch_alloc_sectors(k, bio_sectors(bio), s)) | 369 | if (!bch_alloc_sectors(op->c, k, bio_sectors(bio), |
370 | op->write_point, op->write_prio, | ||
371 | op->writeback)) | ||
515 | goto err; | 372 | goto err; |
516 | 373 | ||
517 | n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); | 374 | n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); |
518 | 375 | ||
519 | n->bi_end_io = bch_insert_data_endio; | 376 | n->bi_end_io = bch_data_insert_endio; |
520 | n->bi_private = cl; | 377 | n->bi_private = cl; |
521 | 378 | ||
522 | if (s->writeback) { | 379 | if (op->writeback) { |
523 | SET_KEY_DIRTY(k, true); | 380 | SET_KEY_DIRTY(k, true); |
524 | 381 | ||
525 | for (i = 0; i < KEY_PTRS(k); i++) | 382 | for (i = 0; i < KEY_PTRS(k); i++) |
@@ -532,17 +389,17 @@ static void bch_insert_data_loop(struct closure *cl) | |||
532 | bio_csum(n, k); | 389 | bio_csum(n, k); |
533 | 390 | ||
534 | trace_bcache_cache_insert(k); | 391 | trace_bcache_cache_insert(k); |
535 | bch_keylist_push(&op->keys); | 392 | bch_keylist_push(&op->insert_keys); |
536 | 393 | ||
537 | n->bi_rw |= REQ_WRITE; | 394 | n->bi_rw |= REQ_WRITE; |
538 | bch_submit_bbio(n, op->c, k, 0); | 395 | bch_submit_bbio(n, op->c, k, 0); |
539 | } while (n != bio); | 396 | } while (n != bio); |
540 | 397 | ||
541 | op->insert_data_done = true; | 398 | op->insert_data_done = true; |
542 | continue_at(cl, bch_journal, bcache_wq); | 399 | continue_at(cl, bch_data_insert_keys, bcache_wq); |
543 | err: | 400 | err: |
544 | /* bch_alloc_sectors() blocks if s->writeback = true */ | 401 | /* bch_alloc_sectors() blocks if s->writeback = true */ |
545 | BUG_ON(s->writeback); | 402 | BUG_ON(op->writeback); |
546 | 403 | ||
547 | /* | 404 | /* |
548 | * But if it's not a writeback write we'd rather just bail out if | 405 | * But if it's not a writeback write we'd rather just bail out if |
@@ -550,15 +407,15 @@ err: | |||
550 | * we might be starving btree writes for gc or something. | 407 | * we might be starving btree writes for gc or something. |
551 | */ | 408 | */ |
552 | 409 | ||
553 | if (s->write) { | 410 | if (!op->replace) { |
554 | /* | 411 | /* |
555 | * Writethrough write: We can't complete the write until we've | 412 | * Writethrough write: We can't complete the write until we've |
556 | * updated the index. But we don't want to delay the write while | 413 | * updated the index. But we don't want to delay the write while |
557 | * we wait for buckets to be freed up, so just invalidate the | 414 | * we wait for buckets to be freed up, so just invalidate the |
558 | * rest of the write. | 415 | * rest of the write. |
559 | */ | 416 | */ |
560 | op->skip = true; | 417 | op->bypass = true; |
561 | return bio_invalidate(cl); | 418 | return bch_data_invalidate(cl); |
562 | } else { | 419 | } else { |
563 | /* | 420 | /* |
564 | * From a cache miss, we can just insert the keys for the data | 421 | * From a cache miss, we can just insert the keys for the data |
@@ -567,15 +424,15 @@ err: | |||
567 | op->insert_data_done = true; | 424 | op->insert_data_done = true; |
568 | bio_put(bio); | 425 | bio_put(bio); |
569 | 426 | ||
570 | if (!bch_keylist_empty(&op->keys)) | 427 | if (!bch_keylist_empty(&op->insert_keys)) |
571 | continue_at(cl, bch_journal, bcache_wq); | 428 | continue_at(cl, bch_data_insert_keys, bcache_wq); |
572 | else | 429 | else |
573 | closure_return(cl); | 430 | closure_return(cl); |
574 | } | 431 | } |
575 | } | 432 | } |
576 | 433 | ||
577 | /** | 434 | /** |
578 | * bch_insert_data - stick some data in the cache | 435 | * bch_data_insert - stick some data in the cache |
579 | * | 436 | * |
580 | * This is the starting point for any data to end up in a cache device; it could | 437 | * This is the starting point for any data to end up in a cache device; it could |
581 | * be from a normal write, or a writeback write, or a write to a flash only | 438 | * be from a normal write, or a writeback write, or a write to a flash only |
@@ -587,56 +444,179 @@ err: | |||
587 | * data is written it calls bch_journal, and after the keys have been added to | 444 | * data is written it calls bch_journal, and after the keys have been added to |
588 | * the next journal write they're inserted into the btree. | 445 | * the next journal write they're inserted into the btree. |
589 | * | 446 | * |
590 | * It inserts the data in op->cache_bio; bi_sector is used for the key offset, | 447 | * It inserts the data in s->cache_bio; bi_sector is used for the key offset, |
591 | * and op->inode is used for the key inode. | 448 | * and op->inode is used for the key inode. |
592 | * | 449 | * |
593 | * If op->skip is true, instead of inserting the data it invalidates the region | 450 | * If s->bypass is true, instead of inserting the data it invalidates the |
594 | * of the cache represented by op->cache_bio and op->inode. | 451 | * region of the cache represented by s->cache_bio and op->inode. |
595 | */ | 452 | */ |
596 | void bch_insert_data(struct closure *cl) | 453 | void bch_data_insert(struct closure *cl) |
597 | { | 454 | { |
598 | struct btree_op *op = container_of(cl, struct btree_op, cl); | 455 | struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); |
456 | |||
457 | trace_bcache_write(op->bio, op->writeback, op->bypass); | ||
599 | 458 | ||
600 | bch_keylist_init(&op->keys); | 459 | bch_keylist_init(&op->insert_keys); |
601 | bio_get(op->cache_bio); | 460 | bio_get(op->bio); |
602 | bch_insert_data_loop(cl); | 461 | bch_data_insert_start(cl); |
603 | } | 462 | } |
604 | 463 | ||
605 | void bch_btree_insert_async(struct closure *cl) | 464 | /* Congested? */ |
465 | |||
466 | unsigned bch_get_congested(struct cache_set *c) | ||
606 | { | 467 | { |
607 | struct btree_op *op = container_of(cl, struct btree_op, cl); | 468 | int i; |
608 | struct search *s = container_of(op, struct search, op); | 469 | long rand; |
609 | 470 | ||
610 | if (bch_btree_insert(op, op->c)) { | 471 | if (!c->congested_read_threshold_us && |
611 | s->error = -ENOMEM; | 472 | !c->congested_write_threshold_us) |
612 | op->insert_data_done = true; | 473 | return 0; |
613 | } | 474 | |
475 | i = (local_clock_us() - c->congested_last_us) / 1024; | ||
476 | if (i < 0) | ||
477 | return 0; | ||
478 | |||
479 | i += atomic_read(&c->congested); | ||
480 | if (i >= 0) | ||
481 | return 0; | ||
614 | 482 | ||
615 | if (op->insert_data_done) { | 483 | i += CONGESTED_MAX; |
616 | bch_keylist_free(&op->keys); | 484 | |
617 | closure_return(cl); | 485 | if (i > 0) |
618 | } else | 486 | i = fract_exp_two(i, 6); |
619 | continue_at(cl, bch_insert_data_loop, bcache_wq); | 487 | |
488 | rand = get_random_int(); | ||
489 | i -= bitmap_weight(&rand, BITS_PER_LONG); | ||
490 | |||
491 | return i > 0 ? i : 1; | ||
620 | } | 492 | } |
621 | 493 | ||
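A note on the tail of bch_get_congested() above: after scaling the congestion score, it subtracts bitmap_weight() of a random word before clamping, so the bypass threshold jitters instead of being one hard cutoff. A rough illustration of that dither in plain C (the score of 20 is arbitrary, and __builtin_popcount stands in for bitmap_weight):

        #include <stdio.h>
        #include <stdlib.h>

        int main(void)
        {
                srand(1);
                for (int n = 0; n < 5; n++) {
                        int i = 20;                        /* congestion score so far */
                        unsigned int r = (unsigned int)rand();

                        i -= __builtin_popcount(r);        /* ~16 on average for 32 random bits */
                        printf("score 20 -> threshold %d\n", i > 0 ? i : 1);
                }
                return 0;
        }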
622 | /* Common code for the make_request functions */ | 494 | static void add_sequential(struct task_struct *t) |
495 | { | ||
496 | ewma_add(t->sequential_io_avg, | ||
497 | t->sequential_io, 8, 0); | ||
623 | 498 | ||
624 | static void request_endio(struct bio *bio, int error) | 499 | t->sequential_io = 0; |
500 | } | ||
501 | |||
502 | static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) | ||
625 | { | 503 | { |
626 | struct closure *cl = bio->bi_private; | 504 | return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; |
505 | } | ||
627 | 506 | ||
628 | if (error) { | 507 | static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) |
629 | struct search *s = container_of(cl, struct search, cl); | 508 | { |
630 | s->error = error; | 509 | struct cache_set *c = dc->disk.c; |
631 | /* Only cache read errors are recoverable */ | 510 | unsigned mode = cache_mode(dc, bio); |
632 | s->recoverable = false; | 511 | unsigned sectors, congested = bch_get_congested(c); |
512 | struct task_struct *task = current; | ||
513 | struct io *i; | ||
514 | |||
515 | if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || | ||
516 | c->gc_stats.in_use > CUTOFF_CACHE_ADD || | ||
517 | (bio->bi_rw & REQ_DISCARD)) | ||
518 | goto skip; | ||
519 | |||
520 | if (mode == CACHE_MODE_NONE || | ||
521 | (mode == CACHE_MODE_WRITEAROUND && | ||
522 | (bio->bi_rw & REQ_WRITE))) | ||
523 | goto skip; | ||
524 | |||
525 | if (bio->bi_sector & (c->sb.block_size - 1) || | ||
526 | bio_sectors(bio) & (c->sb.block_size - 1)) { | ||
527 | pr_debug("skipping unaligned io"); | ||
528 | goto skip; | ||
633 | } | 529 | } |
634 | 530 | ||
635 | bio_put(bio); | 531 | if (bypass_torture_test(dc)) { |
636 | closure_put(cl); | 532 | if ((get_random_int() & 3) == 3) |
533 | goto skip; | ||
534 | else | ||
535 | goto rescale; | ||
536 | } | ||
537 | |||
538 | if (!congested && !dc->sequential_cutoff) | ||
539 | goto rescale; | ||
540 | |||
541 | if (!congested && | ||
542 | mode == CACHE_MODE_WRITEBACK && | ||
543 | (bio->bi_rw & REQ_WRITE) && | ||
544 | (bio->bi_rw & REQ_SYNC)) | ||
545 | goto rescale; | ||
546 | |||
547 | spin_lock(&dc->io_lock); | ||
548 | |||
549 | hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) | ||
550 | if (i->last == bio->bi_sector && | ||
551 | time_before(jiffies, i->jiffies)) | ||
552 | goto found; | ||
553 | |||
554 | i = list_first_entry(&dc->io_lru, struct io, lru); | ||
555 | |||
556 | add_sequential(task); | ||
557 | i->sequential = 0; | ||
558 | found: | ||
559 | if (i->sequential + bio->bi_size > i->sequential) | ||
560 | i->sequential += bio->bi_size; | ||
561 | |||
562 | i->last = bio_end_sector(bio); | ||
563 | i->jiffies = jiffies + msecs_to_jiffies(5000); | ||
564 | task->sequential_io = i->sequential; | ||
565 | |||
566 | hlist_del(&i->hash); | ||
567 | hlist_add_head(&i->hash, iohash(dc, i->last)); | ||
568 | list_move_tail(&i->lru, &dc->io_lru); | ||
569 | |||
570 | spin_unlock(&dc->io_lock); | ||
571 | |||
572 | sectors = max(task->sequential_io, | ||
573 | task->sequential_io_avg) >> 9; | ||
574 | |||
575 | if (dc->sequential_cutoff && | ||
576 | sectors >= dc->sequential_cutoff >> 9) { | ||
577 | trace_bcache_bypass_sequential(bio); | ||
578 | goto skip; | ||
579 | } | ||
580 | |||
581 | if (congested && sectors >= congested) { | ||
582 | trace_bcache_bypass_congested(bio); | ||
583 | goto skip; | ||
584 | } | ||
585 | |||
586 | rescale: | ||
587 | bch_rescale_priorities(c, bio_sectors(bio)); | ||
588 | return false; | ||
589 | skip: | ||
590 | bch_mark_sectors_bypassed(c, dc, bio_sectors(bio)); | ||
591 | return true; | ||
637 | } | 592 | } |
638 | 593 | ||
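The final decision in check_should_bypass() above compares the larger of the task's current and average sequential IO, converted from bytes to sectors, against the sequential cutoff and the congestion threshold. A plain-C sketch of just that decision (the surrounding hash-table bookkeeping is omitted):

        #include <stdbool.h>
        #include <stdio.h>

        /* Sizes are in bytes; cutoffs use the same >> 9 (bytes -> sectors) shift. */
        static bool should_bypass(unsigned long long seq_io, unsigned long long seq_avg,
                                  unsigned long long sequential_cutoff,
                                  unsigned congested_sectors)
        {
                unsigned long long sectors =
                        (seq_io > seq_avg ? seq_io : seq_avg) >> 9;

                if (sequential_cutoff && sectors >= sequential_cutoff >> 9)
                        return true;            /* long sequential stream: bypass the cache */
                if (congested_sectors && sectors >= congested_sectors)
                        return true;            /* cache is congested for IO this large */
                return false;
        }

        int main(void)
        {
                /* 8 MiB of sequential IO against a 4 MiB cutoff -> bypass */
                printf("%d\n", should_bypass(8 << 20, 1 << 20, 4 << 20, 0));
                /* short random IO -> cache it */
                printf("%d\n", should_bypass(64 << 10, 32 << 10, 4 << 20, 0));
                return 0;
        }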
639 | void bch_cache_read_endio(struct bio *bio, int error) | 594 | /* Cache lookup */ |
595 | |||
596 | struct search { | ||
597 | /* Stack frame for bio_complete */ | ||
598 | struct closure cl; | ||
599 | |||
600 | struct bcache_device *d; | ||
601 | |||
602 | struct bbio bio; | ||
603 | struct bio *orig_bio; | ||
604 | struct bio *cache_miss; | ||
605 | |||
606 | unsigned insert_bio_sectors; | ||
607 | |||
608 | unsigned recoverable:1; | ||
609 | unsigned unaligned_bvec:1; | ||
610 | unsigned write:1; | ||
611 | unsigned read_dirty_data:1; | ||
612 | |||
613 | unsigned long start_time; | ||
614 | |||
615 | struct btree_op op; | ||
616 | struct data_insert_op iop; | ||
617 | }; | ||
618 | |||
619 | static void bch_cache_read_endio(struct bio *bio, int error) | ||
640 | { | 620 | { |
641 | struct bbio *b = container_of(bio, struct bbio, bio); | 621 | struct bbio *b = container_of(bio, struct bbio, bio); |
642 | struct closure *cl = bio->bi_private; | 622 | struct closure *cl = bio->bi_private; |
@@ -650,13 +630,113 @@ void bch_cache_read_endio(struct bio *bio, int error) | |||
650 | */ | 630 | */ |
651 | 631 | ||
652 | if (error) | 632 | if (error) |
653 | s->error = error; | 633 | s->iop.error = error; |
654 | else if (ptr_stale(s->op.c, &b->key, 0)) { | 634 | else if (ptr_stale(s->iop.c, &b->key, 0)) { |
655 | atomic_long_inc(&s->op.c->cache_read_races); | 635 | atomic_long_inc(&s->iop.c->cache_read_races); |
656 | s->error = -EINTR; | 636 | s->iop.error = -EINTR; |
657 | } | 637 | } |
658 | 638 | ||
659 | bch_bbio_endio(s->op.c, bio, error, "reading from cache"); | 639 | bch_bbio_endio(s->iop.c, bio, error, "reading from cache"); |
640 | } | ||
641 | |||
642 | /* | ||
643 | * Read from a single key, handling the initial cache miss if the key starts in | ||
644 | * the middle of the bio | ||
645 | */ | ||
646 | static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) | ||
647 | { | ||
648 | struct search *s = container_of(op, struct search, op); | ||
649 | struct bio *n, *bio = &s->bio.bio; | ||
650 | struct bkey *bio_key; | ||
651 | unsigned ptr; | ||
652 | |||
653 | if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_sector, 0)) <= 0) | ||
654 | return MAP_CONTINUE; | ||
655 | |||
656 | if (KEY_INODE(k) != s->iop.inode || | ||
657 | KEY_START(k) > bio->bi_sector) { | ||
658 | unsigned bio_sectors = bio_sectors(bio); | ||
659 | unsigned sectors = KEY_INODE(k) == s->iop.inode | ||
660 | ? min_t(uint64_t, INT_MAX, | ||
661 | KEY_START(k) - bio->bi_sector) | ||
662 | : INT_MAX; | ||
663 | |||
664 | int ret = s->d->cache_miss(b, s, bio, sectors); | ||
665 | if (ret != MAP_CONTINUE) | ||
666 | return ret; | ||
667 | |||
668 | /* if this was a complete miss we shouldn't get here */ | ||
669 | BUG_ON(bio_sectors <= sectors); | ||
670 | } | ||
671 | |||
672 | if (!KEY_SIZE(k)) | ||
673 | return MAP_CONTINUE; | ||
674 | |||
675 | /* XXX: figure out best pointer - for multiple cache devices */ | ||
676 | ptr = 0; | ||
677 | |||
678 | PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; | ||
679 | |||
680 | if (KEY_DIRTY(k)) | ||
681 | s->read_dirty_data = true; | ||
682 | |||
683 | n = bch_bio_split(bio, min_t(uint64_t, INT_MAX, | ||
684 | KEY_OFFSET(k) - bio->bi_sector), | ||
685 | GFP_NOIO, s->d->bio_split); | ||
686 | |||
687 | bio_key = &container_of(n, struct bbio, bio)->key; | ||
688 | bch_bkey_copy_single_ptr(bio_key, k, ptr); | ||
689 | |||
690 | bch_cut_front(&KEY(s->iop.inode, n->bi_sector, 0), bio_key); | ||
691 | bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key); | ||
692 | |||
693 | n->bi_end_io = bch_cache_read_endio; | ||
694 | n->bi_private = &s->cl; | ||
695 | |||
696 | /* | ||
697 | * The bucket we're reading from might be reused while our bio | ||
698 | * is in flight, and we could then end up reading the wrong | ||
699 | * data. | ||
700 | * | ||
701 | * We guard against this by checking (in cache_read_endio()) if | ||
702 | * the pointer is stale again; if so, we treat it as an error | ||
703 | * and reread from the backing device (but we don't pass that | ||
704 | * error up anywhere). | ||
705 | */ | ||
706 | |||
707 | __bch_submit_bbio(n, b->c); | ||
708 | return n == bio ? MAP_DONE : MAP_CONTINUE; | ||
709 | } | ||
710 | |||
711 | static void cache_lookup(struct closure *cl) | ||
712 | { | ||
713 | struct search *s = container_of(cl, struct search, iop.cl); | ||
714 | struct bio *bio = &s->bio.bio; | ||
715 | |||
716 | int ret = bch_btree_map_keys(&s->op, s->iop.c, | ||
717 | &KEY(s->iop.inode, bio->bi_sector, 0), | ||
718 | cache_lookup_fn, MAP_END_KEY); | ||
719 | if (ret == -EAGAIN) | ||
720 | continue_at(cl, cache_lookup, bcache_wq); | ||
721 | |||
722 | closure_return(cl); | ||
723 | } | ||
724 | |||
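cache_lookup() and cache_lookup_fn() above replace the old btree_read_async()/search_recurse pair: the per-key callback returns MAP_CONTINUE to keep walking keys and MAP_DONE once the bio has been fully handled, and an -EAGAIN from bch_btree_map_keys() simply re-queues cache_lookup on bcache_wq via continue_at(). A single-threaded model of that contract, with an array standing in for the btree and a plain retry loop standing in for the closure/workqueue machinery (lookup_fn and map_keys below are invented names):

/* Sketch of the map-keys callback contract used by cache_lookup(). */
#include <errno.h>
#include <stdio.h>

enum { MAP_DONE = 0, MAP_CONTINUE = 1 };

struct lookup {
	unsigned remaining;   /* sectors of the bio still unresolved */
	unsigned passes;      /* how many times the walk was (re)started */
};

/* Per-key callback: consume what this key covers, say whether to keep walking. */
static int lookup_fn(struct lookup *s, unsigned key_sectors)
{
	if (s->remaining == 0)
		return MAP_DONE;

	unsigned n = key_sectors < s->remaining ? key_sectors : s->remaining;
	s->remaining -= n;
	return s->remaining ? MAP_CONTINUE : MAP_DONE;
}

/* Walk the keys; pretend the first pass hits contention and returns -EAGAIN. */
static int map_keys(struct lookup *s, const unsigned *keys, int nkeys)
{
	if (s->passes++ == 0)
		return -EAGAIN;      /* caller must re-submit, like continue_at() */

	for (int i = 0; i < nkeys; i++)
		if (lookup_fn(s, keys[i]) == MAP_DONE)
			break;
	return 0;
}

int main(void)
{
	const unsigned keys[] = { 8, 16, 64 };   /* sectors covered by each key */
	struct lookup s = { .remaining = 32, .passes = 0 };

	int ret;
	do {
		ret = map_keys(&s, keys, 3);
	} while (ret == -EAGAIN);

	printf("lookup finished after %u pass(es), %u sectors left\n",
	       s.passes, s.remaining);
	return 0;
}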
725 | /* Common code for the make_request functions */ | ||
726 | |||
727 | static void request_endio(struct bio *bio, int error) | ||
728 | { | ||
729 | struct closure *cl = bio->bi_private; | ||
730 | |||
731 | if (error) { | ||
732 | struct search *s = container_of(cl, struct search, cl); | ||
733 | s->iop.error = error; | ||
734 | /* Only cache read errors are recoverable */ | ||
735 | s->recoverable = false; | ||
736 | } | ||
737 | |||
738 | bio_put(bio); | ||
739 | closure_put(cl); | ||
660 | } | 740 | } |
661 | 741 | ||
662 | static void bio_complete(struct search *s) | 742 | static void bio_complete(struct search *s) |
@@ -670,8 +750,8 @@ static void bio_complete(struct search *s) | |||
670 | part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration); | 750 | part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration); |
671 | part_stat_unlock(); | 751 | part_stat_unlock(); |
672 | 752 | ||
673 | trace_bcache_request_end(s, s->orig_bio); | 753 | trace_bcache_request_end(s->d, s->orig_bio); |
674 | bio_endio(s->orig_bio, s->error); | 754 | bio_endio(s->orig_bio, s->iop.error); |
675 | s->orig_bio = NULL; | 755 | s->orig_bio = NULL; |
676 | } | 756 | } |
677 | } | 757 | } |
@@ -691,8 +771,8 @@ static void search_free(struct closure *cl) | |||
691 | struct search *s = container_of(cl, struct search, cl); | 771 | struct search *s = container_of(cl, struct search, cl); |
692 | bio_complete(s); | 772 | bio_complete(s); |
693 | 773 | ||
694 | if (s->op.cache_bio) | 774 | if (s->iop.bio) |
695 | bio_put(s->op.cache_bio); | 775 | bio_put(s->iop.bio); |
696 | 776 | ||
697 | if (s->unaligned_bvec) | 777 | if (s->unaligned_bvec) |
698 | mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); | 778 | mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); |
@@ -703,21 +783,22 @@ static void search_free(struct closure *cl) | |||
703 | 783 | ||
704 | static struct search *search_alloc(struct bio *bio, struct bcache_device *d) | 784 | static struct search *search_alloc(struct bio *bio, struct bcache_device *d) |
705 | { | 785 | { |
786 | struct search *s; | ||
706 | struct bio_vec *bv; | 787 | struct bio_vec *bv; |
707 | struct search *s = mempool_alloc(d->c->search, GFP_NOIO); | 788 | |
708 | memset(s, 0, offsetof(struct search, op.keys)); | 789 | s = mempool_alloc(d->c->search, GFP_NOIO); |
790 | memset(s, 0, offsetof(struct search, iop.insert_keys)); | ||
709 | 791 | ||
710 | __closure_init(&s->cl, NULL); | 792 | __closure_init(&s->cl, NULL); |
711 | 793 | ||
712 | s->op.inode = d->id; | 794 | s->iop.inode = d->id; |
713 | s->op.c = d->c; | 795 | s->iop.c = d->c; |
714 | s->d = d; | 796 | s->d = d; |
715 | s->op.lock = -1; | 797 | s->op.lock = -1; |
716 | s->task = current; | 798 | s->iop.write_point = hash_long((unsigned long) current, 16); |
717 | s->orig_bio = bio; | 799 | s->orig_bio = bio; |
718 | s->write = (bio->bi_rw & REQ_WRITE) != 0; | 800 | s->write = (bio->bi_rw & REQ_WRITE) != 0; |
719 | s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; | 801 | s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; |
720 | s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; | ||
721 | s->recoverable = 1; | 802 | s->recoverable = 1; |
722 | s->start_time = jiffies; | 803 | s->start_time = jiffies; |
723 | do_bio_hook(s); | 804 | do_bio_hook(s); |
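One behavioural change hidden in this hunk: search_alloc() no longer records the submitting task, it hashes it straight into s->iop.write_point (new line 798). The write point is later used when choosing an open bucket for the insert, so data written by different tasks tends to land in different buckets instead of interleaving. A minimal illustration of that hashing step; the hash constant and the 16-slot width are stand-ins, not the kernel's hash_long():

/* Sketch: deriving a write point from the submitting task's pointer. */
#include <stdint.h>
#include <stdio.h>

#define WRITE_POINT_BITS 4   /* 16 slots for the example */

static unsigned write_point(uint64_t task_ptr)
{
	/* Fibonacci hashing, keep the top WRITE_POINT_BITS bits. */
	uint64_t h = task_ptr * 0x9E3779B97F4A7C15ull;
	return (unsigned)(h >> (64 - WRITE_POINT_BITS));
}

int main(void)
{
	/* Three made-up task_struct addresses. */
	uint64_t tasks[] = { 0xffff888100042a80ull,
			     0xffff888100294d00ull,
			     0xffff8881004e1540ull };

	for (int i = 0; i < 3; i++)
		printf("task %d -> write point %u\n", i, write_point(tasks[i]));
	return 0;
}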
@@ -734,18 +815,6 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d) | |||
734 | return s; | 815 | return s; |
735 | } | 816 | } |
736 | 817 | ||
737 | static void btree_read_async(struct closure *cl) | ||
738 | { | ||
739 | struct btree_op *op = container_of(cl, struct btree_op, cl); | ||
740 | |||
741 | int ret = btree_root(search_recurse, op->c, op); | ||
742 | |||
743 | if (ret == -EAGAIN) | ||
744 | continue_at(cl, btree_read_async, bcache_wq); | ||
745 | |||
746 | closure_return(cl); | ||
747 | } | ||
748 | |||
749 | /* Cached devices */ | 818 | /* Cached devices */ |
750 | 819 | ||
751 | static void cached_dev_bio_complete(struct closure *cl) | 820 | static void cached_dev_bio_complete(struct closure *cl) |
@@ -759,27 +828,28 @@ static void cached_dev_bio_complete(struct closure *cl) | |||
759 | 828 | ||
760 | /* Process reads */ | 829 | /* Process reads */ |
761 | 830 | ||
762 | static void cached_dev_read_complete(struct closure *cl) | 831 | static void cached_dev_cache_miss_done(struct closure *cl) |
763 | { | 832 | { |
764 | struct search *s = container_of(cl, struct search, cl); | 833 | struct search *s = container_of(cl, struct search, cl); |
765 | 834 | ||
766 | if (s->op.insert_collision) | 835 | if (s->iop.replace_collision) |
767 | bch_mark_cache_miss_collision(s); | 836 | bch_mark_cache_miss_collision(s->iop.c, s->d); |
768 | 837 | ||
769 | if (s->op.cache_bio) { | 838 | if (s->iop.bio) { |
770 | int i; | 839 | int i; |
771 | struct bio_vec *bv; | 840 | struct bio_vec *bv; |
772 | 841 | ||
773 | __bio_for_each_segment(bv, s->op.cache_bio, i, 0) | 842 | bio_for_each_segment_all(bv, s->iop.bio, i) |
774 | __free_page(bv->bv_page); | 843 | __free_page(bv->bv_page); |
775 | } | 844 | } |
776 | 845 | ||
777 | cached_dev_bio_complete(cl); | 846 | cached_dev_bio_complete(cl); |
778 | } | 847 | } |
779 | 848 | ||
780 | static void request_read_error(struct closure *cl) | 849 | static void cached_dev_read_error(struct closure *cl) |
781 | { | 850 | { |
782 | struct search *s = container_of(cl, struct search, cl); | 851 | struct search *s = container_of(cl, struct search, cl); |
852 | struct bio *bio = &s->bio.bio; | ||
783 | struct bio_vec *bv; | 853 | struct bio_vec *bv; |
784 | int i; | 854 | int i; |
785 | 855 | ||
@@ -787,7 +857,7 @@ static void request_read_error(struct closure *cl) | |||
787 | /* Retry from the backing device: */ | 857 | /* Retry from the backing device: */ |
788 | trace_bcache_read_retry(s->orig_bio); | 858 | trace_bcache_read_retry(s->orig_bio); |
789 | 859 | ||
790 | s->error = 0; | 860 | s->iop.error = 0; |
791 | bv = s->bio.bio.bi_io_vec; | 861 | bv = s->bio.bio.bi_io_vec; |
792 | do_bio_hook(s); | 862 | do_bio_hook(s); |
793 | s->bio.bio.bi_io_vec = bv; | 863 | s->bio.bio.bi_io_vec = bv; |
@@ -803,146 +873,148 @@ static void request_read_error(struct closure *cl) | |||
803 | 873 | ||
804 | /* XXX: invalidate cache */ | 874 | /* XXX: invalidate cache */ |
805 | 875 | ||
806 | closure_bio_submit(&s->bio.bio, &s->cl, s->d); | 876 | closure_bio_submit(bio, cl, s->d); |
807 | } | 877 | } |
808 | 878 | ||
809 | continue_at(cl, cached_dev_read_complete, NULL); | 879 | continue_at(cl, cached_dev_cache_miss_done, NULL); |
810 | } | 880 | } |
811 | 881 | ||
812 | static void request_read_done(struct closure *cl) | 882 | static void cached_dev_read_done(struct closure *cl) |
813 | { | 883 | { |
814 | struct search *s = container_of(cl, struct search, cl); | 884 | struct search *s = container_of(cl, struct search, cl); |
815 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | 885 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); |
816 | 886 | ||
817 | /* | 887 | /* |
818 | * s->cache_bio != NULL implies that we had a cache miss; cache_bio now | 888 | * We had a cache miss; cache_bio now contains data ready to be inserted |
819 | * contains data ready to be inserted into the cache. | 889 | * into the cache. |
820 | * | 890 | * |
821 | * First, we copy the data we just read from cache_bio's bounce buffers | 891 | * First, we copy the data we just read from cache_bio's bounce buffers |
822 | * to the buffers the original bio pointed to: | 892 | * to the buffers the original bio pointed to: |
823 | */ | 893 | */ |
824 | 894 | ||
825 | if (s->op.cache_bio) { | 895 | if (s->iop.bio) { |
826 | bio_reset(s->op.cache_bio); | 896 | bio_reset(s->iop.bio); |
827 | s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; | 897 | s->iop.bio->bi_sector = s->cache_miss->bi_sector; |
828 | s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; | 898 | s->iop.bio->bi_bdev = s->cache_miss->bi_bdev; |
829 | s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; | 899 | s->iop.bio->bi_size = s->insert_bio_sectors << 9; |
830 | bch_bio_map(s->op.cache_bio, NULL); | 900 | bch_bio_map(s->iop.bio, NULL); |
831 | 901 | ||
832 | bio_copy_data(s->cache_miss, s->op.cache_bio); | 902 | bio_copy_data(s->cache_miss, s->iop.bio); |
833 | 903 | ||
834 | bio_put(s->cache_miss); | 904 | bio_put(s->cache_miss); |
835 | s->cache_miss = NULL; | 905 | s->cache_miss = NULL; |
836 | } | 906 | } |
837 | 907 | ||
838 | if (verify(dc, &s->bio.bio) && s->recoverable) | 908 | if (verify(dc, &s->bio.bio) && s->recoverable && |
839 | bch_data_verify(s); | 909 | !s->unaligned_bvec && !s->read_dirty_data) |
910 | bch_data_verify(dc, s->orig_bio); | ||
840 | 911 | ||
841 | bio_complete(s); | 912 | bio_complete(s); |
842 | 913 | ||
843 | if (s->op.cache_bio && | 914 | if (s->iop.bio && |
844 | !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) { | 915 | !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) { |
845 | s->op.type = BTREE_REPLACE; | 916 | BUG_ON(!s->iop.replace); |
846 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); | 917 | closure_call(&s->iop.cl, bch_data_insert, NULL, cl); |
847 | } | 918 | } |
848 | 919 | ||
849 | continue_at(cl, cached_dev_read_complete, NULL); | 920 | continue_at(cl, cached_dev_cache_miss_done, NULL); |
850 | } | 921 | } |
851 | 922 | ||
852 | static void request_read_done_bh(struct closure *cl) | 923 | static void cached_dev_read_done_bh(struct closure *cl) |
853 | { | 924 | { |
854 | struct search *s = container_of(cl, struct search, cl); | 925 | struct search *s = container_of(cl, struct search, cl); |
855 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | 926 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); |
856 | 927 | ||
857 | bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); | 928 | bch_mark_cache_accounting(s->iop.c, s->d, |
858 | trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip); | 929 | !s->cache_miss, s->iop.bypass); |
930 | trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass); | ||
859 | 931 | ||
860 | if (s->error) | 932 | if (s->iop.error) |
861 | continue_at_nobarrier(cl, request_read_error, bcache_wq); | 933 | continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); |
862 | else if (s->op.cache_bio || verify(dc, &s->bio.bio)) | 934 | else if (s->iop.bio || verify(dc, &s->bio.bio)) |
863 | continue_at_nobarrier(cl, request_read_done, bcache_wq); | 935 | continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); |
864 | else | 936 | else |
865 | continue_at_nobarrier(cl, cached_dev_read_complete, NULL); | 937 | continue_at_nobarrier(cl, cached_dev_bio_complete, NULL); |
866 | } | 938 | } |
867 | 939 | ||
868 | static int cached_dev_cache_miss(struct btree *b, struct search *s, | 940 | static int cached_dev_cache_miss(struct btree *b, struct search *s, |
869 | struct bio *bio, unsigned sectors) | 941 | struct bio *bio, unsigned sectors) |
870 | { | 942 | { |
871 | int ret = 0; | 943 | int ret = MAP_CONTINUE; |
872 | unsigned reada; | 944 | unsigned reada = 0; |
873 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | 945 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); |
874 | struct bio *miss; | 946 | struct bio *miss, *cache_bio; |
875 | |||
876 | miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); | ||
877 | if (miss == bio) | ||
878 | s->op.lookup_done = true; | ||
879 | 947 | ||
880 | miss->bi_end_io = request_endio; | 948 | if (s->cache_miss || s->iop.bypass) { |
881 | miss->bi_private = &s->cl; | 949 | miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); |
882 | 950 | ret = miss == bio ? MAP_DONE : MAP_CONTINUE; | |
883 | if (s->cache_miss || s->op.skip) | ||
884 | goto out_submit; | 951 | goto out_submit; |
885 | |||
886 | if (miss != bio || | ||
887 | (bio->bi_rw & REQ_RAHEAD) || | ||
888 | (bio->bi_rw & REQ_META) || | ||
889 | s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA) | ||
890 | reada = 0; | ||
891 | else { | ||
892 | reada = min(dc->readahead >> 9, | ||
893 | sectors - bio_sectors(miss)); | ||
894 | |||
895 | if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev)) | ||
896 | reada = bdev_sectors(miss->bi_bdev) - | ||
897 | bio_end_sector(miss); | ||
898 | } | 952 | } |
899 | 953 | ||
900 | s->cache_bio_sectors = bio_sectors(miss) + reada; | 954 | if (!(bio->bi_rw & REQ_RAHEAD) && |
901 | s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT, | 955 | !(bio->bi_rw & REQ_META) && |
902 | DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), | 956 | s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) |
903 | dc->disk.bio_split); | 957 | reada = min_t(sector_t, dc->readahead >> 9, |
958 | bdev_sectors(bio->bi_bdev) - bio_end_sector(bio)); | ||
904 | 959 | ||
905 | if (!s->op.cache_bio) | 960 | s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada); |
906 | goto out_submit; | ||
907 | 961 | ||
908 | s->op.cache_bio->bi_sector = miss->bi_sector; | 962 | s->iop.replace_key = KEY(s->iop.inode, |
909 | s->op.cache_bio->bi_bdev = miss->bi_bdev; | 963 | bio->bi_sector + s->insert_bio_sectors, |
910 | s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; | 964 | s->insert_bio_sectors); |
911 | 965 | ||
912 | s->op.cache_bio->bi_end_io = request_endio; | 966 | ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key); |
913 | s->op.cache_bio->bi_private = &s->cl; | 967 | if (ret) |
968 | return ret; | ||
969 | |||
970 | s->iop.replace = true; | ||
971 | |||
972 | miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); | ||
914 | 973 | ||
915 | /* btree_search_recurse()'s btree iterator is no good anymore */ | 974 | /* btree_search_recurse()'s btree iterator is no good anymore */ |
916 | ret = -EINTR; | 975 | ret = miss == bio ? MAP_DONE : -EINTR; |
917 | if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio)) | 976 | |
918 | goto out_put; | 977 | cache_bio = bio_alloc_bioset(GFP_NOWAIT, |
978 | DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS), | ||
979 | dc->disk.bio_split); | ||
980 | if (!cache_bio) | ||
981 | goto out_submit; | ||
982 | |||
983 | cache_bio->bi_sector = miss->bi_sector; | ||
984 | cache_bio->bi_bdev = miss->bi_bdev; | ||
985 | cache_bio->bi_size = s->insert_bio_sectors << 9; | ||
986 | |||
987 | cache_bio->bi_end_io = request_endio; | ||
988 | cache_bio->bi_private = &s->cl; | ||
919 | 989 | ||
920 | bch_bio_map(s->op.cache_bio, NULL); | 990 | bch_bio_map(cache_bio, NULL); |
921 | if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) | 991 | if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) |
922 | goto out_put; | 992 | goto out_put; |
923 | 993 | ||
924 | s->cache_miss = miss; | 994 | if (reada) |
925 | bio_get(s->op.cache_bio); | 995 | bch_mark_cache_readahead(s->iop.c, s->d); |
926 | 996 | ||
927 | closure_bio_submit(s->op.cache_bio, &s->cl, s->d); | 997 | s->cache_miss = miss; |
998 | s->iop.bio = cache_bio; | ||
999 | bio_get(cache_bio); | ||
1000 | closure_bio_submit(cache_bio, &s->cl, s->d); | ||
928 | 1001 | ||
929 | return ret; | 1002 | return ret; |
930 | out_put: | 1003 | out_put: |
931 | bio_put(s->op.cache_bio); | 1004 | bio_put(cache_bio); |
932 | s->op.cache_bio = NULL; | ||
933 | out_submit: | 1005 | out_submit: |
1006 | miss->bi_end_io = request_endio; | ||
1007 | miss->bi_private = &s->cl; | ||
934 | closure_bio_submit(miss, &s->cl, s->d); | 1008 | closure_bio_submit(miss, &s->cl, s->d); |
935 | return ret; | 1009 | return ret; |
936 | } | 1010 | } |
937 | 1011 | ||
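The rewritten cached_dev_cache_miss() above sizes the cache insert before splitting the bio: readahead is only added for ordinary reads while the cache is not too full, it is clamped so it never runs past the end of the backing device, and the resulting insert_bio_sectors is further capped by the hole size the btree walk reported. The same arithmetic, isolated into a user-space helper with made-up numbers (insert_sectors is an invented name):

/* Sketch of the cache-miss sizing arithmetic in cached_dev_cache_miss(). */
#include <stdint.h>
#include <stdio.h>

static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

/* How many sectors will be read and inserted for one cache miss. */
static uint64_t insert_sectors(uint64_t bio_start, uint64_t bio_sectors,
			       uint64_t hole_sectors,       /* gap until the next cached key */
			       uint64_t readahead_sectors,  /* dc->readahead >> 9 */
			       uint64_t device_sectors,     /* size of the backing device */
			       int allow_readahead)         /* !REQ_RAHEAD, !REQ_META, cache not full */
{
	uint64_t reada = 0;

	if (allow_readahead) {
		/* Never read past the end of the backing device. */
		uint64_t to_end = device_sectors - (bio_start + bio_sectors);
		reada = min_u64(readahead_sectors, to_end);
	}

	/* The insert may not cross into the next key already in the cache. */
	return min_u64(hole_sectors, bio_sectors + reada);
}

int main(void)
{
	/* 8-sector read near the end of a 1000-sector device, 64-sector
	 * readahead, next cached extent 200 sectors away. */
	uint64_t n = insert_sectors(990, 8, 200, 64, 1000, 1);
	printf("will insert %llu sectors\n", (unsigned long long)n);  /* 10: 8 + 2 clamped */
	return 0;
}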
938 | static void request_read(struct cached_dev *dc, struct search *s) | 1012 | static void cached_dev_read(struct cached_dev *dc, struct search *s) |
939 | { | 1013 | { |
940 | struct closure *cl = &s->cl; | 1014 | struct closure *cl = &s->cl; |
941 | 1015 | ||
942 | check_should_skip(dc, s); | 1016 | closure_call(&s->iop.cl, cache_lookup, NULL, cl); |
943 | closure_call(&s->op.cl, btree_read_async, NULL, cl); | 1017 | continue_at(cl, cached_dev_read_done_bh, NULL); |
944 | |||
945 | continue_at(cl, request_read_done_bh, NULL); | ||
946 | } | 1018 | } |
947 | 1019 | ||
948 | /* Process writes */ | 1020 | /* Process writes */ |
@@ -956,47 +1028,52 @@ static void cached_dev_write_complete(struct closure *cl) | |||
956 | cached_dev_bio_complete(cl); | 1028 | cached_dev_bio_complete(cl); |
957 | } | 1029 | } |
958 | 1030 | ||
959 | static void request_write(struct cached_dev *dc, struct search *s) | 1031 | static void cached_dev_write(struct cached_dev *dc, struct search *s) |
960 | { | 1032 | { |
961 | struct closure *cl = &s->cl; | 1033 | struct closure *cl = &s->cl; |
962 | struct bio *bio = &s->bio.bio; | 1034 | struct bio *bio = &s->bio.bio; |
963 | struct bkey start, end; | 1035 | struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0); |
964 | start = KEY(dc->disk.id, bio->bi_sector, 0); | 1036 | struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0); |
965 | end = KEY(dc->disk.id, bio_end_sector(bio), 0); | ||
966 | 1037 | ||
967 | bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); | 1038 | bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end); |
968 | 1039 | ||
969 | check_should_skip(dc, s); | ||
970 | down_read_non_owner(&dc->writeback_lock); | 1040 | down_read_non_owner(&dc->writeback_lock); |
971 | |||
972 | if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { | 1041 | if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { |
973 | s->op.skip = false; | 1042 | /* |
974 | s->writeback = true; | 1043 | * We overlap with some dirty data undergoing background |
1044 | * writeback, force this write to writeback | ||
1045 | */ | ||
1046 | s->iop.bypass = false; | ||
1047 | s->iop.writeback = true; | ||
975 | } | 1048 | } |
976 | 1049 | ||
1050 | /* | ||
1051 | * Discards aren't _required_ to do anything, so skipping if | ||
1052 | * check_overlapping returned true is ok | ||
1053 | * | ||
1054 | * But check_overlapping drops dirty keys for which io hasn't started, | ||
1055 | * so we still want to call it. | ||
1056 | */ | ||
977 | if (bio->bi_rw & REQ_DISCARD) | 1057 | if (bio->bi_rw & REQ_DISCARD) |
978 | goto skip; | 1058 | s->iop.bypass = true; |
979 | 1059 | ||
980 | if (should_writeback(dc, s->orig_bio, | 1060 | if (should_writeback(dc, s->orig_bio, |
981 | cache_mode(dc, bio), | 1061 | cache_mode(dc, bio), |
982 | s->op.skip)) { | 1062 | s->iop.bypass)) { |
983 | s->op.skip = false; | 1063 | s->iop.bypass = false; |
984 | s->writeback = true; | 1064 | s->iop.writeback = true; |
985 | } | 1065 | } |
986 | 1066 | ||
987 | if (s->op.skip) | 1067 | if (s->iop.bypass) { |
988 | goto skip; | 1068 | s->iop.bio = s->orig_bio; |
989 | 1069 | bio_get(s->iop.bio); | |
990 | trace_bcache_write(s->orig_bio, s->writeback, s->op.skip); | ||
991 | 1070 | ||
992 | if (!s->writeback) { | 1071 | if (!(bio->bi_rw & REQ_DISCARD) || |
993 | s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, | 1072 | blk_queue_discard(bdev_get_queue(dc->bdev))) |
994 | dc->disk.bio_split); | 1073 | closure_bio_submit(bio, cl, s->d); |
995 | 1074 | } else if (s->iop.writeback) { | |
996 | closure_bio_submit(bio, cl, s->d); | ||
997 | } else { | ||
998 | bch_writeback_add(dc); | 1075 | bch_writeback_add(dc); |
999 | s->op.cache_bio = bio; | 1076 | s->iop.bio = bio; |
1000 | 1077 | ||
1001 | if (bio->bi_rw & REQ_FLUSH) { | 1078 | if (bio->bi_rw & REQ_FLUSH) { |
1002 | /* Also need to send a flush to the backing device */ | 1079 | /* Also need to send a flush to the backing device */ |
@@ -1010,36 +1087,26 @@ static void request_write(struct cached_dev *dc, struct search *s) | |||
1010 | 1087 | ||
1011 | closure_bio_submit(flush, cl, s->d); | 1088 | closure_bio_submit(flush, cl, s->d); |
1012 | } | 1089 | } |
1013 | } | 1090 | } else { |
1014 | out: | 1091 | s->iop.bio = bio_clone_bioset(bio, GFP_NOIO, |
1015 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); | 1092 | dc->disk.bio_split); |
1016 | continue_at(cl, cached_dev_write_complete, NULL); | ||
1017 | skip: | ||
1018 | s->op.skip = true; | ||
1019 | s->op.cache_bio = s->orig_bio; | ||
1020 | bio_get(s->op.cache_bio); | ||
1021 | 1093 | ||
1022 | if ((bio->bi_rw & REQ_DISCARD) && | 1094 | closure_bio_submit(bio, cl, s->d); |
1023 | !blk_queue_discard(bdev_get_queue(dc->bdev))) | 1095 | } |
1024 | goto out; | ||
1025 | 1096 | ||
1026 | closure_bio_submit(bio, cl, s->d); | 1097 | closure_call(&s->iop.cl, bch_data_insert, NULL, cl); |
1027 | goto out; | 1098 | continue_at(cl, cached_dev_write_complete, NULL); |
1028 | } | 1099 | } |
1029 | 1100 | ||
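cached_dev_write() above now settles on one of three shapes before handing the bio to bch_data_insert(): bypass (data goes only to the backing device and the insert merely invalidates any stale cached copy), writeback (data goes only to the cache and is flushed later), or writethrough (a clone goes to the cache while the original goes to the backing device). The decision order from the diff, reduced to booleans (decide_write_mode and its parameters are invented names):

/* Sketch: how cached_dev_write() picks bypass / writeback / writethrough. */
#include <stdio.h>

enum write_mode { WRITE_BYPASS, WRITE_BACK, WRITE_THROUGH };

static enum write_mode decide_write_mode(int bypass_hint,        /* check_should_bypass() said skip */
					 int overlaps_writeback,  /* dirty data already being written back */
					 int is_discard,
					 int should_writeback)    /* policy: cache this write as dirty */
{
	int bypass = bypass_hint;
	int writeback = 0;

	/* Overlap with in-flight background writeback forces writeback mode. */
	if (overlaps_writeback) {
		bypass = 0;
		writeback = 1;
	}

	/* Discards carry no data, so bypassing the cache is always allowed. */
	if (is_discard)
		bypass = 1;

	if (should_writeback) {
		bypass = 0;
		writeback = 1;
	}

	if (bypass)
		return WRITE_BYPASS;      /* backing device only; insert invalidates */
	return writeback ? WRITE_BACK     /* cache only; flushed by writeback later */
			 : WRITE_THROUGH; /* both cache and backing device */
}

int main(void)
{
	static const char *names[] = { "bypass", "writeback", "writethrough" };

	printf("plain write, no hints:     %s\n", names[decide_write_mode(0, 0, 0, 0)]);
	printf("sequential write (bypass): %s\n", names[decide_write_mode(1, 0, 0, 0)]);
	printf("overlaps dirty data:       %s\n", names[decide_write_mode(1, 1, 0, 0)]);
	printf("discard:                   %s\n", names[decide_write_mode(0, 0, 1, 0)]);
	return 0;
}

Note that a discard sets bypass but a later should_writeback() verdict can still override it, which matches the ordering in the hunk above.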
1030 | static void request_nodata(struct cached_dev *dc, struct search *s) | 1101 | static void cached_dev_nodata(struct closure *cl) |
1031 | { | 1102 | { |
1032 | struct closure *cl = &s->cl; | 1103 | struct search *s = container_of(cl, struct search, cl); |
1033 | struct bio *bio = &s->bio.bio; | 1104 | struct bio *bio = &s->bio.bio; |
1034 | 1105 | ||
1035 | if (bio->bi_rw & REQ_DISCARD) { | 1106 | if (s->iop.flush_journal) |
1036 | request_write(dc, s); | 1107 | bch_journal_meta(s->iop.c, cl); |
1037 | return; | ||
1038 | } | ||
1039 | |||
1040 | if (s->op.flush_journal) | ||
1041 | bch_journal_meta(s->op.c, cl); | ||
1042 | 1108 | ||
1109 | /* If it's a flush, we send the flush to the backing device too */ | ||
1043 | closure_bio_submit(bio, cl, s->d); | 1110 | closure_bio_submit(bio, cl, s->d); |
1044 | 1111 | ||
1045 | continue_at(cl, cached_dev_bio_complete, NULL); | 1112 | continue_at(cl, cached_dev_bio_complete, NULL); |
@@ -1047,134 +1114,6 @@ static void request_nodata(struct cached_dev *dc, struct search *s) | |||
1047 | 1114 | ||
1048 | /* Cached devices - read & write stuff */ | 1115 | /* Cached devices - read & write stuff */ |
1049 | 1116 | ||
1050 | unsigned bch_get_congested(struct cache_set *c) | ||
1051 | { | ||
1052 | int i; | ||
1053 | long rand; | ||
1054 | |||
1055 | if (!c->congested_read_threshold_us && | ||
1056 | !c->congested_write_threshold_us) | ||
1057 | return 0; | ||
1058 | |||
1059 | i = (local_clock_us() - c->congested_last_us) / 1024; | ||
1060 | if (i < 0) | ||
1061 | return 0; | ||
1062 | |||
1063 | i += atomic_read(&c->congested); | ||
1064 | if (i >= 0) | ||
1065 | return 0; | ||
1066 | |||
1067 | i += CONGESTED_MAX; | ||
1068 | |||
1069 | if (i > 0) | ||
1070 | i = fract_exp_two(i, 6); | ||
1071 | |||
1072 | rand = get_random_int(); | ||
1073 | i -= bitmap_weight(&rand, BITS_PER_LONG); | ||
1074 | |||
1075 | return i > 0 ? i : 1; | ||
1076 | } | ||
1077 | |||
1078 | static void add_sequential(struct task_struct *t) | ||
1079 | { | ||
1080 | ewma_add(t->sequential_io_avg, | ||
1081 | t->sequential_io, 8, 0); | ||
1082 | |||
1083 | t->sequential_io = 0; | ||
1084 | } | ||
1085 | |||
1086 | static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) | ||
1087 | { | ||
1088 | return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; | ||
1089 | } | ||
1090 | |||
1091 | static void check_should_skip(struct cached_dev *dc, struct search *s) | ||
1092 | { | ||
1093 | struct cache_set *c = s->op.c; | ||
1094 | struct bio *bio = &s->bio.bio; | ||
1095 | unsigned mode = cache_mode(dc, bio); | ||
1096 | unsigned sectors, congested = bch_get_congested(c); | ||
1097 | |||
1098 | if (atomic_read(&dc->disk.detaching) || | ||
1099 | c->gc_stats.in_use > CUTOFF_CACHE_ADD || | ||
1100 | (bio->bi_rw & REQ_DISCARD)) | ||
1101 | goto skip; | ||
1102 | |||
1103 | if (mode == CACHE_MODE_NONE || | ||
1104 | (mode == CACHE_MODE_WRITEAROUND && | ||
1105 | (bio->bi_rw & REQ_WRITE))) | ||
1106 | goto skip; | ||
1107 | |||
1108 | if (bio->bi_sector & (c->sb.block_size - 1) || | ||
1109 | bio_sectors(bio) & (c->sb.block_size - 1)) { | ||
1110 | pr_debug("skipping unaligned io"); | ||
1111 | goto skip; | ||
1112 | } | ||
1113 | |||
1114 | if (!congested && !dc->sequential_cutoff) | ||
1115 | goto rescale; | ||
1116 | |||
1117 | if (!congested && | ||
1118 | mode == CACHE_MODE_WRITEBACK && | ||
1119 | (bio->bi_rw & REQ_WRITE) && | ||
1120 | (bio->bi_rw & REQ_SYNC)) | ||
1121 | goto rescale; | ||
1122 | |||
1123 | if (dc->sequential_merge) { | ||
1124 | struct io *i; | ||
1125 | |||
1126 | spin_lock(&dc->io_lock); | ||
1127 | |||
1128 | hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) | ||
1129 | if (i->last == bio->bi_sector && | ||
1130 | time_before(jiffies, i->jiffies)) | ||
1131 | goto found; | ||
1132 | |||
1133 | i = list_first_entry(&dc->io_lru, struct io, lru); | ||
1134 | |||
1135 | add_sequential(s->task); | ||
1136 | i->sequential = 0; | ||
1137 | found: | ||
1138 | if (i->sequential + bio->bi_size > i->sequential) | ||
1139 | i->sequential += bio->bi_size; | ||
1140 | |||
1141 | i->last = bio_end_sector(bio); | ||
1142 | i->jiffies = jiffies + msecs_to_jiffies(5000); | ||
1143 | s->task->sequential_io = i->sequential; | ||
1144 | |||
1145 | hlist_del(&i->hash); | ||
1146 | hlist_add_head(&i->hash, iohash(dc, i->last)); | ||
1147 | list_move_tail(&i->lru, &dc->io_lru); | ||
1148 | |||
1149 | spin_unlock(&dc->io_lock); | ||
1150 | } else { | ||
1151 | s->task->sequential_io = bio->bi_size; | ||
1152 | |||
1153 | add_sequential(s->task); | ||
1154 | } | ||
1155 | |||
1156 | sectors = max(s->task->sequential_io, | ||
1157 | s->task->sequential_io_avg) >> 9; | ||
1158 | |||
1159 | if (dc->sequential_cutoff && | ||
1160 | sectors >= dc->sequential_cutoff >> 9) { | ||
1161 | trace_bcache_bypass_sequential(s->orig_bio); | ||
1162 | goto skip; | ||
1163 | } | ||
1164 | |||
1165 | if (congested && sectors >= congested) { | ||
1166 | trace_bcache_bypass_congested(s->orig_bio); | ||
1167 | goto skip; | ||
1168 | } | ||
1169 | |||
1170 | rescale: | ||
1171 | bch_rescale_priorities(c, bio_sectors(bio)); | ||
1172 | return; | ||
1173 | skip: | ||
1174 | bch_mark_sectors_bypassed(s, bio_sectors(bio)); | ||
1175 | s->op.skip = true; | ||
1176 | } | ||
1177 | |||
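The removed block ends with the old home of bch_get_congested(), which remains exported via request.h further down: it turns recent cache-device congestion into a sector threshold, and a sequential stream longer than that threshold gets bypassed, so the heavier and more recent the congestion, the easier it becomes to bypass the cache. The sketch below reconstructs the shape of that computation; the piecewise-linear exponential and the constants are re-derived for illustration and should be treated as assumptions rather than the driver's exact arithmetic.

/* Sketch: congestion -> bypass threshold, loosely modelled on bch_get_congested(). */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define CONGESTED_MAX 1024

/* Piecewise-linear approximation of 2^(x / 2^fract_bits). */
static unsigned fract_exp_two(unsigned x, unsigned fract_bits)
{
	unsigned fract = x & ((1u << fract_bits) - 1);

	x >>= fract_bits;
	x = 1u << x;
	return x + ((x * fract) >> fract_bits);
}

/* congested: the cache set's (negative when busy) congestion counter.
 * us_since:  microseconds since congestion was last observed. */
static unsigned get_congested(int congested, int64_t us_since)
{
	int64_t i = us_since / 1024;        /* decays roughly per millisecond */
	if (i < 0)
		return 0;

	i += congested;
	if (i >= 0)
		return 0;                   /* fully decayed, no bypass pressure */

	i += CONGESTED_MAX;

	int v = i > 0 ? (int)fract_exp_two((unsigned)i, 6) : 0;

	/* Random dither so bypassed streams don't all flip back at once. */
	v -= __builtin_popcount((unsigned)rand());

	return v > 0 ? (unsigned)v : 1;
}

int main(void)
{
	srand(1);
	printf("mild congestion  -> threshold %u sectors\n", get_congested(-200, 10 * 1024));
	printf("heavy congestion -> threshold %u sectors\n", get_congested(-900, 10 * 1024));
	printf("long recovered   -> threshold %u sectors\n", get_congested(-200, 500 * 1024));
	return 0;
}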
1178 | static void cached_dev_make_request(struct request_queue *q, struct bio *bio) | 1117 | static void cached_dev_make_request(struct request_queue *q, struct bio *bio) |
1179 | { | 1118 | { |
1180 | struct search *s; | 1119 | struct search *s; |
@@ -1192,14 +1131,24 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio) | |||
1192 | 1131 | ||
1193 | if (cached_dev_get(dc)) { | 1132 | if (cached_dev_get(dc)) { |
1194 | s = search_alloc(bio, d); | 1133 | s = search_alloc(bio, d); |
1195 | trace_bcache_request_start(s, bio); | 1134 | trace_bcache_request_start(s->d, bio); |
1196 | 1135 | ||
1197 | if (!bio_has_data(bio)) | 1136 | if (!bio->bi_size) { |
1198 | request_nodata(dc, s); | 1137 | /* |
1199 | else if (rw) | 1138 | * can't call bch_journal_meta from under |
1200 | request_write(dc, s); | 1139 | * generic_make_request |
1201 | else | 1140 | */ |
1202 | request_read(dc, s); | 1141 | continue_at_nobarrier(&s->cl, |
1142 | cached_dev_nodata, | ||
1143 | bcache_wq); | ||
1144 | } else { | ||
1145 | s->iop.bypass = check_should_bypass(dc, bio); | ||
1146 | |||
1147 | if (rw) | ||
1148 | cached_dev_write(dc, s); | ||
1149 | else | ||
1150 | cached_dev_read(dc, s); | ||
1151 | } | ||
1203 | } else { | 1152 | } else { |
1204 | if ((bio->bi_rw & REQ_DISCARD) && | 1153 | if ((bio->bi_rw & REQ_DISCARD) && |
1205 | !blk_queue_discard(bdev_get_queue(dc->bdev))) | 1154 | !blk_queue_discard(bdev_get_queue(dc->bdev))) |
@@ -1274,9 +1223,19 @@ static int flash_dev_cache_miss(struct btree *b, struct search *s, | |||
1274 | bio_advance(bio, min(sectors << 9, bio->bi_size)); | 1223 | bio_advance(bio, min(sectors << 9, bio->bi_size)); |
1275 | 1224 | ||
1276 | if (!bio->bi_size) | 1225 | if (!bio->bi_size) |
1277 | s->op.lookup_done = true; | 1226 | return MAP_DONE; |
1278 | 1227 | ||
1279 | return 0; | 1228 | return MAP_CONTINUE; |
1229 | } | ||
1230 | |||
1231 | static void flash_dev_nodata(struct closure *cl) | ||
1232 | { | ||
1233 | struct search *s = container_of(cl, struct search, cl); | ||
1234 | |||
1235 | if (s->iop.flush_journal) | ||
1236 | bch_journal_meta(s->iop.c, cl); | ||
1237 | |||
1238 | continue_at(cl, search_free, NULL); | ||
1280 | } | 1239 | } |
1281 | 1240 | ||
1282 | static void flash_dev_make_request(struct request_queue *q, struct bio *bio) | 1241 | static void flash_dev_make_request(struct request_queue *q, struct bio *bio) |
@@ -1295,23 +1254,28 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) | |||
1295 | cl = &s->cl; | 1254 | cl = &s->cl; |
1296 | bio = &s->bio.bio; | 1255 | bio = &s->bio.bio; |
1297 | 1256 | ||
1298 | trace_bcache_request_start(s, bio); | 1257 | trace_bcache_request_start(s->d, bio); |
1299 | 1258 | ||
1300 | if (bio_has_data(bio) && !rw) { | 1259 | if (!bio->bi_size) { |
1301 | closure_call(&s->op.cl, btree_read_async, NULL, cl); | 1260 | /* |
1302 | } else if (bio_has_data(bio) || s->op.skip) { | 1261 | * can't call bch_journal_meta from under |
1303 | bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, | 1262 | * generic_make_request |
1263 | */ | ||
1264 | continue_at_nobarrier(&s->cl, | ||
1265 | flash_dev_nodata, | ||
1266 | bcache_wq); | ||
1267 | } else if (rw) { | ||
1268 | bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, | ||
1304 | &KEY(d->id, bio->bi_sector, 0), | 1269 | &KEY(d->id, bio->bi_sector, 0), |
1305 | &KEY(d->id, bio_end_sector(bio), 0)); | 1270 | &KEY(d->id, bio_end_sector(bio), 0)); |
1306 | 1271 | ||
1307 | s->writeback = true; | 1272 | s->iop.bypass = (bio->bi_rw & REQ_DISCARD) != 0; |
1308 | s->op.cache_bio = bio; | 1273 | s->iop.writeback = true; |
1274 | s->iop.bio = bio; | ||
1309 | 1275 | ||
1310 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); | 1276 | closure_call(&s->iop.cl, bch_data_insert, NULL, cl); |
1311 | } else { | 1277 | } else { |
1312 | /* No data - probably a cache flush */ | 1278 | closure_call(&s->iop.cl, cache_lookup, NULL, cl); |
1313 | if (s->op.flush_journal) | ||
1314 | bch_journal_meta(s->op.c, cl); | ||
1315 | } | 1279 | } |
1316 | 1280 | ||
1317 | continue_at(cl, search_free, NULL); | 1281 | continue_at(cl, search_free, NULL); |
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 57dc4784f4f4..2cd65bf073c2 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h | |||
@@ -3,40 +3,33 @@ | |||
3 | 3 | ||
4 | #include <linux/cgroup.h> | 4 | #include <linux/cgroup.h> |
5 | 5 | ||
6 | struct search { | 6 | struct data_insert_op { |
7 | /* Stack frame for bio_complete */ | ||
8 | struct closure cl; | 7 | struct closure cl; |
8 | struct cache_set *c; | ||
9 | struct bio *bio; | ||
9 | 10 | ||
10 | struct bcache_device *d; | 11 | unsigned inode; |
11 | struct task_struct *task; | 12 | uint16_t write_point; |
12 | 13 | uint16_t write_prio; | |
13 | struct bbio bio; | 14 | short error; |
14 | struct bio *orig_bio; | ||
15 | struct bio *cache_miss; | ||
16 | unsigned cache_bio_sectors; | ||
17 | |||
18 | unsigned recoverable:1; | ||
19 | unsigned unaligned_bvec:1; | ||
20 | 15 | ||
21 | unsigned write:1; | 16 | unsigned bypass:1; |
22 | unsigned writeback:1; | 17 | unsigned writeback:1; |
18 | unsigned flush_journal:1; | ||
19 | unsigned csum:1; | ||
23 | 20 | ||
24 | /* IO error returned to s->bio */ | 21 | unsigned replace:1; |
25 | short error; | 22 | unsigned replace_collision:1; |
26 | unsigned long start_time; | 23 | |
24 | unsigned insert_data_done:1; | ||
27 | 25 | ||
28 | /* Anything past op->keys won't get zeroed in do_bio_hook */ | 26 | /* Anything past this point won't get zeroed in search_alloc() */ |
29 | struct btree_op op; | 27 | struct keylist insert_keys; |
28 | BKEY_PADDED(replace_key); | ||
30 | }; | 29 | }; |
31 | 30 | ||
32 | void bch_cache_read_endio(struct bio *, int); | ||
33 | unsigned bch_get_congested(struct cache_set *); | 31 | unsigned bch_get_congested(struct cache_set *); |
34 | void bch_insert_data(struct closure *cl); | 32 | void bch_data_insert(struct closure *cl); |
35 | void bch_btree_insert_async(struct closure *); | ||
36 | void bch_cache_read_endio(struct bio *, int); | ||
37 | |||
38 | void bch_open_buckets_free(struct cache_set *); | ||
39 | int bch_open_buckets_alloc(struct cache_set *); | ||
40 | 33 | ||
41 | void bch_cached_dev_request_init(struct cached_dev *dc); | 34 | void bch_cached_dev_request_init(struct cached_dev *dc); |
42 | void bch_flash_dev_request_init(struct bcache_device *d); | 35 | void bch_flash_dev_request_init(struct bcache_device *d); |
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index b8730e714d69..84d0782f702e 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c | |||
@@ -7,7 +7,6 @@ | |||
7 | #include "bcache.h" | 7 | #include "bcache.h" |
8 | #include "stats.h" | 8 | #include "stats.h" |
9 | #include "btree.h" | 9 | #include "btree.h" |
10 | #include "request.h" | ||
11 | #include "sysfs.h" | 10 | #include "sysfs.h" |
12 | 11 | ||
13 | /* | 12 | /* |
@@ -196,35 +195,36 @@ static void mark_cache_stats(struct cache_stat_collector *stats, | |||
196 | atomic_inc(&stats->cache_bypass_misses); | 195 | atomic_inc(&stats->cache_bypass_misses); |
197 | } | 196 | } |
198 | 197 | ||
199 | void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass) | 198 | void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, |
199 | bool hit, bool bypass) | ||
200 | { | 200 | { |
201 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | 201 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); |
202 | mark_cache_stats(&dc->accounting.collector, hit, bypass); | 202 | mark_cache_stats(&dc->accounting.collector, hit, bypass); |
203 | mark_cache_stats(&s->op.c->accounting.collector, hit, bypass); | 203 | mark_cache_stats(&c->accounting.collector, hit, bypass); |
204 | #ifdef CONFIG_CGROUP_BCACHE | 204 | #ifdef CONFIG_CGROUP_BCACHE |
205 | mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass); | 205 | mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass); |
206 | #endif | 206 | #endif |
207 | } | 207 | } |
208 | 208 | ||
209 | void bch_mark_cache_readahead(struct search *s) | 209 | void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d) |
210 | { | 210 | { |
211 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | 211 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); |
212 | atomic_inc(&dc->accounting.collector.cache_readaheads); | 212 | atomic_inc(&dc->accounting.collector.cache_readaheads); |
213 | atomic_inc(&s->op.c->accounting.collector.cache_readaheads); | 213 | atomic_inc(&c->accounting.collector.cache_readaheads); |
214 | } | 214 | } |
215 | 215 | ||
216 | void bch_mark_cache_miss_collision(struct search *s) | 216 | void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d) |
217 | { | 217 | { |
218 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | 218 | struct cached_dev *dc = container_of(d, struct cached_dev, disk); |
219 | atomic_inc(&dc->accounting.collector.cache_miss_collisions); | 219 | atomic_inc(&dc->accounting.collector.cache_miss_collisions); |
220 | atomic_inc(&s->op.c->accounting.collector.cache_miss_collisions); | 220 | atomic_inc(&c->accounting.collector.cache_miss_collisions); |
221 | } | 221 | } |
222 | 222 | ||
223 | void bch_mark_sectors_bypassed(struct search *s, int sectors) | 223 | void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc, |
224 | int sectors) | ||
224 | { | 225 | { |
225 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | ||
226 | atomic_add(sectors, &dc->accounting.collector.sectors_bypassed); | 226 | atomic_add(sectors, &dc->accounting.collector.sectors_bypassed); |
227 | atomic_add(sectors, &s->op.c->accounting.collector.sectors_bypassed); | 227 | atomic_add(sectors, &c->accounting.collector.sectors_bypassed); |
228 | } | 228 | } |
229 | 229 | ||
230 | void bch_cache_accounting_init(struct cache_accounting *acc, | 230 | void bch_cache_accounting_init(struct cache_accounting *acc, |
diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h index c7c7a8fd29fe..adbff141c887 100644 --- a/drivers/md/bcache/stats.h +++ b/drivers/md/bcache/stats.h | |||
@@ -38,7 +38,9 @@ struct cache_accounting { | |||
38 | struct cache_stats day; | 38 | struct cache_stats day; |
39 | }; | 39 | }; |
40 | 40 | ||
41 | struct search; | 41 | struct cache_set; |
42 | struct cached_dev; | ||
43 | struct bcache_device; | ||
42 | 44 | ||
43 | void bch_cache_accounting_init(struct cache_accounting *acc, | 45 | void bch_cache_accounting_init(struct cache_accounting *acc, |
44 | struct closure *parent); | 46 | struct closure *parent); |
@@ -50,9 +52,10 @@ void bch_cache_accounting_clear(struct cache_accounting *acc); | |||
50 | 52 | ||
51 | void bch_cache_accounting_destroy(struct cache_accounting *acc); | 53 | void bch_cache_accounting_destroy(struct cache_accounting *acc); |
52 | 54 | ||
53 | void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass); | 55 | void bch_mark_cache_accounting(struct cache_set *, struct bcache_device *, |
54 | void bch_mark_cache_readahead(struct search *s); | 56 | bool, bool); |
55 | void bch_mark_cache_miss_collision(struct search *s); | 57 | void bch_mark_cache_readahead(struct cache_set *, struct bcache_device *); |
56 | void bch_mark_sectors_bypassed(struct search *s, int sectors); | 58 | void bch_mark_cache_miss_collision(struct cache_set *, struct bcache_device *); |
59 | void bch_mark_sectors_bypassed(struct cache_set *, struct cached_dev *, int); | ||
57 | 60 | ||
58 | #endif /* _BCACHE_STATS_H_ */ | 61 | #endif /* _BCACHE_STATS_H_ */ |
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 547c4c57b052..dec15cd2d797 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/buffer_head.h> | 16 | #include <linux/buffer_head.h> |
17 | #include <linux/debugfs.h> | 17 | #include <linux/debugfs.h> |
18 | #include <linux/genhd.h> | 18 | #include <linux/genhd.h> |
19 | #include <linux/idr.h> | ||
19 | #include <linux/kthread.h> | 20 | #include <linux/kthread.h> |
20 | #include <linux/module.h> | 21 | #include <linux/module.h> |
21 | #include <linux/random.h> | 22 | #include <linux/random.h> |
@@ -45,21 +46,13 @@ const char * const bch_cache_modes[] = { | |||
45 | NULL | 46 | NULL |
46 | }; | 47 | }; |
47 | 48 | ||
48 | struct uuid_entry_v0 { | ||
49 | uint8_t uuid[16]; | ||
50 | uint8_t label[32]; | ||
51 | uint32_t first_reg; | ||
52 | uint32_t last_reg; | ||
53 | uint32_t invalidated; | ||
54 | uint32_t pad; | ||
55 | }; | ||
56 | |||
57 | static struct kobject *bcache_kobj; | 49 | static struct kobject *bcache_kobj; |
58 | struct mutex bch_register_lock; | 50 | struct mutex bch_register_lock; |
59 | LIST_HEAD(bch_cache_sets); | 51 | LIST_HEAD(bch_cache_sets); |
60 | static LIST_HEAD(uncached_devices); | 52 | static LIST_HEAD(uncached_devices); |
61 | 53 | ||
62 | static int bcache_major, bcache_minor; | 54 | static int bcache_major; |
55 | static DEFINE_IDA(bcache_minor); | ||
63 | static wait_queue_head_t unregister_wait; | 56 | static wait_queue_head_t unregister_wait; |
64 | struct workqueue_struct *bcache_wq; | 57 | struct workqueue_struct *bcache_wq; |
65 | 58 | ||
@@ -382,7 +375,7 @@ static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) | |||
382 | { | 375 | { |
383 | struct bkey *k = &j->uuid_bucket; | 376 | struct bkey *k = &j->uuid_bucket; |
384 | 377 | ||
385 | if (__bch_ptr_invalid(c, 1, k)) | 378 | if (bch_btree_ptr_invalid(c, k)) |
386 | return "bad uuid pointer"; | 379 | return "bad uuid pointer"; |
387 | 380 | ||
388 | bkey_copy(&c->uuid_bucket, k); | 381 | bkey_copy(&c->uuid_bucket, k); |
@@ -427,7 +420,7 @@ static int __uuid_write(struct cache_set *c) | |||
427 | 420 | ||
428 | lockdep_assert_held(&bch_register_lock); | 421 | lockdep_assert_held(&bch_register_lock); |
429 | 422 | ||
430 | if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, &cl)) | 423 | if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) |
431 | return 1; | 424 | return 1; |
432 | 425 | ||
433 | SET_KEY_SIZE(&k.key, c->sb.bucket_size); | 426 | SET_KEY_SIZE(&k.key, c->sb.bucket_size); |
@@ -435,7 +428,7 @@ static int __uuid_write(struct cache_set *c) | |||
435 | closure_sync(&cl); | 428 | closure_sync(&cl); |
436 | 429 | ||
437 | bkey_copy(&c->uuid_bucket, &k.key); | 430 | bkey_copy(&c->uuid_bucket, &k.key); |
438 | __bkey_put(c, &k.key); | 431 | bkey_put(c, &k.key); |
439 | return 0; | 432 | return 0; |
440 | } | 433 | } |
441 | 434 | ||
@@ -562,10 +555,10 @@ void bch_prio_write(struct cache *ca) | |||
562 | } | 555 | } |
563 | 556 | ||
564 | p->next_bucket = ca->prio_buckets[i + 1]; | 557 | p->next_bucket = ca->prio_buckets[i + 1]; |
565 | p->magic = pset_magic(ca); | 558 | p->magic = pset_magic(&ca->sb); |
566 | p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); | 559 | p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); |
567 | 560 | ||
568 | bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, &cl); | 561 | bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true); |
569 | BUG_ON(bucket == -1); | 562 | BUG_ON(bucket == -1); |
570 | 563 | ||
571 | mutex_unlock(&ca->set->bucket_lock); | 564 | mutex_unlock(&ca->set->bucket_lock); |
@@ -613,7 +606,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) | |||
613 | if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8)) | 606 | if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8)) |
614 | pr_warn("bad csum reading priorities"); | 607 | pr_warn("bad csum reading priorities"); |
615 | 608 | ||
616 | if (p->magic != pset_magic(ca)) | 609 | if (p->magic != pset_magic(&ca->sb)) |
617 | pr_warn("bad magic reading priorities"); | 610 | pr_warn("bad magic reading priorities"); |
618 | 611 | ||
619 | bucket = p->next_bucket; | 612 | bucket = p->next_bucket; |
@@ -630,7 +623,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) | |||
630 | static int open_dev(struct block_device *b, fmode_t mode) | 623 | static int open_dev(struct block_device *b, fmode_t mode) |
631 | { | 624 | { |
632 | struct bcache_device *d = b->bd_disk->private_data; | 625 | struct bcache_device *d = b->bd_disk->private_data; |
633 | if (atomic_read(&d->closing)) | 626 | if (test_bit(BCACHE_DEV_CLOSING, &d->flags)) |
634 | return -ENXIO; | 627 | return -ENXIO; |
635 | 628 | ||
636 | closure_get(&d->cl); | 629 | closure_get(&d->cl); |
@@ -659,20 +652,24 @@ static const struct block_device_operations bcache_ops = { | |||
659 | 652 | ||
660 | void bcache_device_stop(struct bcache_device *d) | 653 | void bcache_device_stop(struct bcache_device *d) |
661 | { | 654 | { |
662 | if (!atomic_xchg(&d->closing, 1)) | 655 | if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags)) |
663 | closure_queue(&d->cl); | 656 | closure_queue(&d->cl); |
664 | } | 657 | } |
665 | 658 | ||
666 | static void bcache_device_unlink(struct bcache_device *d) | 659 | static void bcache_device_unlink(struct bcache_device *d) |
667 | { | 660 | { |
668 | unsigned i; | 661 | lockdep_assert_held(&bch_register_lock); |
669 | struct cache *ca; | ||
670 | 662 | ||
671 | sysfs_remove_link(&d->c->kobj, d->name); | 663 | if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) { |
672 | sysfs_remove_link(&d->kobj, "cache"); | 664 | unsigned i; |
665 | struct cache *ca; | ||
673 | 666 | ||
674 | for_each_cache(ca, d->c, i) | 667 | sysfs_remove_link(&d->c->kobj, d->name); |
675 | bd_unlink_disk_holder(ca->bdev, d->disk); | 668 | sysfs_remove_link(&d->kobj, "cache"); |
669 | |||
670 | for_each_cache(ca, d->c, i) | ||
671 | bd_unlink_disk_holder(ca->bdev, d->disk); | ||
672 | } | ||
676 | } | 673 | } |
677 | 674 | ||
678 | static void bcache_device_link(struct bcache_device *d, struct cache_set *c, | 675 | static void bcache_device_link(struct bcache_device *d, struct cache_set *c, |
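The super.c hunks here replace the separate closing/detaching atomics on struct bcache_device with a single flags word tested through BCACHE_DEV_CLOSING, BCACHE_DEV_DETACHING and BCACHE_DEV_UNLINK_DONE bits, so every "run this teardown step exactly once" check becomes one test_and_set_bit(). A user-space analogue using C11 atomics (the struct and helper names below are invented, only the bit names mirror the diff):

/* Sketch: one atomic flags word with test_and_set_bit style run-once guards. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum {
	DEV_CLOSING     = 0,
	DEV_DETACHING   = 1,
	DEV_UNLINK_DONE = 2,
};

struct device_state {
	atomic_ulong flags;
};

/* Returns the previous value of the bit, like test_and_set_bit(). */
static bool test_and_set_flag(struct device_state *d, unsigned bit)
{
	unsigned long mask = 1UL << bit;
	return atomic_fetch_or(&d->flags, mask) & mask;
}

static bool test_flag(struct device_state *d, unsigned bit)
{
	return atomic_load(&d->flags) & (1UL << bit);
}

static void device_stop(struct device_state *d)
{
	/* Only the first caller queues the teardown work. */
	if (!test_and_set_flag(d, DEV_CLOSING))
		printf("queueing device teardown\n");
	else
		printf("teardown already queued, nothing to do\n");
}

int main(void)
{
	struct device_state d;
	atomic_init(&d.flags, 0);

	device_stop(&d);   /* queues the teardown */
	device_stop(&d);   /* second call is a no-op */

	if (!test_flag(&d, DEV_DETACHING))
		printf("device was never mid-detach\n");
	return 0;
}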
@@ -696,19 +693,16 @@ static void bcache_device_detach(struct bcache_device *d) | |||
696 | { | 693 | { |
697 | lockdep_assert_held(&bch_register_lock); | 694 | lockdep_assert_held(&bch_register_lock); |
698 | 695 | ||
699 | if (atomic_read(&d->detaching)) { | 696 | if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) { |
700 | struct uuid_entry *u = d->c->uuids + d->id; | 697 | struct uuid_entry *u = d->c->uuids + d->id; |
701 | 698 | ||
702 | SET_UUID_FLASH_ONLY(u, 0); | 699 | SET_UUID_FLASH_ONLY(u, 0); |
703 | memcpy(u->uuid, invalid_uuid, 16); | 700 | memcpy(u->uuid, invalid_uuid, 16); |
704 | u->invalidated = cpu_to_le32(get_seconds()); | 701 | u->invalidated = cpu_to_le32(get_seconds()); |
705 | bch_uuid_write(d->c); | 702 | bch_uuid_write(d->c); |
706 | |||
707 | atomic_set(&d->detaching, 0); | ||
708 | } | 703 | } |
709 | 704 | ||
710 | if (!d->flush_done) | 705 | bcache_device_unlink(d); |
711 | bcache_device_unlink(d); | ||
712 | 706 | ||
713 | d->c->devices[d->id] = NULL; | 707 | d->c->devices[d->id] = NULL; |
714 | closure_put(&d->c->caching); | 708 | closure_put(&d->c->caching); |
@@ -739,14 +733,20 @@ static void bcache_device_free(struct bcache_device *d) | |||
739 | del_gendisk(d->disk); | 733 | del_gendisk(d->disk); |
740 | if (d->disk && d->disk->queue) | 734 | if (d->disk && d->disk->queue) |
741 | blk_cleanup_queue(d->disk->queue); | 735 | blk_cleanup_queue(d->disk->queue); |
742 | if (d->disk) | 736 | if (d->disk) { |
737 | ida_simple_remove(&bcache_minor, d->disk->first_minor); | ||
743 | put_disk(d->disk); | 738 | put_disk(d->disk); |
739 | } | ||
744 | 740 | ||
745 | bio_split_pool_free(&d->bio_split_hook); | 741 | bio_split_pool_free(&d->bio_split_hook); |
746 | if (d->unaligned_bvec) | 742 | if (d->unaligned_bvec) |
747 | mempool_destroy(d->unaligned_bvec); | 743 | mempool_destroy(d->unaligned_bvec); |
748 | if (d->bio_split) | 744 | if (d->bio_split) |
749 | bioset_free(d->bio_split); | 745 | bioset_free(d->bio_split); |
746 | if (is_vmalloc_addr(d->full_dirty_stripes)) | ||
747 | vfree(d->full_dirty_stripes); | ||
748 | else | ||
749 | kfree(d->full_dirty_stripes); | ||
750 | if (is_vmalloc_addr(d->stripe_sectors_dirty)) | 750 | if (is_vmalloc_addr(d->stripe_sectors_dirty)) |
751 | vfree(d->stripe_sectors_dirty); | 751 | vfree(d->stripe_sectors_dirty); |
752 | else | 752 | else |
@@ -760,15 +760,19 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, | |||
760 | { | 760 | { |
761 | struct request_queue *q; | 761 | struct request_queue *q; |
762 | size_t n; | 762 | size_t n; |
763 | int minor; | ||
763 | 764 | ||
764 | if (!d->stripe_size_bits) | 765 | if (!d->stripe_size) |
765 | d->stripe_size_bits = 31; | 766 | d->stripe_size = 1 << 31; |
766 | 767 | ||
767 | d->nr_stripes = round_up(sectors, 1 << d->stripe_size_bits) >> | 768 | d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size); |
768 | d->stripe_size_bits; | ||
769 | 769 | ||
770 | if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) | 770 | if (!d->nr_stripes || |
771 | d->nr_stripes > INT_MAX || | ||
772 | d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) { | ||
773 | pr_err("nr_stripes too large"); | ||
771 | return -ENOMEM; | 774 | return -ENOMEM; |
775 | } | ||
772 | 776 | ||
773 | n = d->nr_stripes * sizeof(atomic_t); | 777 | n = d->nr_stripes * sizeof(atomic_t); |
774 | d->stripe_sectors_dirty = n < PAGE_SIZE << 6 | 778 | d->stripe_sectors_dirty = n < PAGE_SIZE << 6 |
@@ -777,22 +781,38 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, | |||
777 | if (!d->stripe_sectors_dirty) | 781 | if (!d->stripe_sectors_dirty) |
778 | return -ENOMEM; | 782 | return -ENOMEM; |
779 | 783 | ||
784 | n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long); | ||
785 | d->full_dirty_stripes = n < PAGE_SIZE << 6 | ||
786 | ? kzalloc(n, GFP_KERNEL) | ||
787 | : vzalloc(n); | ||
788 | if (!d->full_dirty_stripes) | ||
789 | return -ENOMEM; | ||
790 | |||
791 | minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL); | ||
792 | if (minor < 0) | ||
793 | return minor; | ||
794 | |||
780 | if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || | 795 | if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || |
781 | !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, | 796 | !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, |
782 | sizeof(struct bio_vec) * BIO_MAX_PAGES)) || | 797 | sizeof(struct bio_vec) * BIO_MAX_PAGES)) || |
783 | bio_split_pool_init(&d->bio_split_hook) || | 798 | bio_split_pool_init(&d->bio_split_hook) || |
784 | !(d->disk = alloc_disk(1)) || | 799 | !(d->disk = alloc_disk(1))) { |
785 | !(q = blk_alloc_queue(GFP_KERNEL))) | 800 | ida_simple_remove(&bcache_minor, minor); |
786 | return -ENOMEM; | 801 | return -ENOMEM; |
802 | } | ||
787 | 803 | ||
788 | set_capacity(d->disk, sectors); | 804 | set_capacity(d->disk, sectors); |
789 | snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); | 805 | snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor); |
790 | 806 | ||
791 | d->disk->major = bcache_major; | 807 | d->disk->major = bcache_major; |
792 | d->disk->first_minor = bcache_minor++; | 808 | d->disk->first_minor = minor; |
793 | d->disk->fops = &bcache_ops; | 809 | d->disk->fops = &bcache_ops; |
794 | d->disk->private_data = d; | 810 | d->disk->private_data = d; |
795 | 811 | ||
812 | q = blk_alloc_queue(GFP_KERNEL); | ||
813 | if (!q) | ||
814 | return -ENOMEM; | ||
815 | |||
796 | blk_queue_make_request(q, NULL); | 816 | blk_queue_make_request(q, NULL); |
797 | d->disk->queue = q; | 817 | d->disk->queue = q; |
798 | q->queuedata = d; | 818 | q->queuedata = d; |
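bcache_device_init() used to hand out minors from a bare bcache_minor++ counter, so minors were never reclaimed and repeated register/unregister cycles would eventually run out of device numbers; the hunk above switches to an IDA bounded by MINORMASK + 1 and releases the minor again in bcache_device_free(). A toy allocator with the same reuse property (this shows the idea, not the kernel IDA implementation):

/* Sketch: why ida_simple_get()/ida_simple_remove() matter -- freed ids come back. */
#include <stdio.h>
#include <string.h>

#define MAX_IDS 8   /* tiny stand-in for MINORMASK + 1 */

struct id_alloc {
	unsigned char used[MAX_IDS];
};

static int id_get(struct id_alloc *a)
{
	for (int i = 0; i < MAX_IDS; i++)
		if (!a->used[i]) {
			a->used[i] = 1;
			return i;          /* lowest free id, like the IDA */
		}
	return -1;                         /* space exhausted */
}

static void id_put(struct id_alloc *a, int id)
{
	if (id >= 0 && id < MAX_IDS)
		a->used[id] = 0;
}

int main(void)
{
	struct id_alloc a;
	memset(&a, 0, sizeof(a));

	int first = id_get(&a);            /* 0 */
	int second = id_get(&a);           /* 1 */
	printf("registered bcache%d and bcache%d\n", first, second);

	id_put(&a, first);                 /* unregister bcache0 */
	printf("re-registering gets bcache%d again\n", id_get(&a));

	/* A bare counter would have returned 2 here and never reclaimed 0. */
	(void)second;
	return 0;
}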
@@ -874,7 +894,7 @@ static void cached_dev_detach_finish(struct work_struct *w) | |||
874 | struct closure cl; | 894 | struct closure cl; |
875 | closure_init_stack(&cl); | 895 | closure_init_stack(&cl); |
876 | 896 | ||
877 | BUG_ON(!atomic_read(&dc->disk.detaching)); | 897 | BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); |
878 | BUG_ON(atomic_read(&dc->count)); | 898 | BUG_ON(atomic_read(&dc->count)); |
879 | 899 | ||
880 | mutex_lock(&bch_register_lock); | 900 | mutex_lock(&bch_register_lock); |
@@ -888,6 +908,8 @@ static void cached_dev_detach_finish(struct work_struct *w) | |||
888 | bcache_device_detach(&dc->disk); | 908 | bcache_device_detach(&dc->disk); |
889 | list_move(&dc->list, &uncached_devices); | 909 | list_move(&dc->list, &uncached_devices); |
890 | 910 | ||
911 | clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags); | ||
912 | |||
891 | mutex_unlock(&bch_register_lock); | 913 | mutex_unlock(&bch_register_lock); |
892 | 914 | ||
893 | pr_info("Caching disabled for %s", bdevname(dc->bdev, buf)); | 915 | pr_info("Caching disabled for %s", bdevname(dc->bdev, buf)); |
@@ -900,10 +922,10 @@ void bch_cached_dev_detach(struct cached_dev *dc) | |||
900 | { | 922 | { |
901 | lockdep_assert_held(&bch_register_lock); | 923 | lockdep_assert_held(&bch_register_lock); |
902 | 924 | ||
903 | if (atomic_read(&dc->disk.closing)) | 925 | if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) |
904 | return; | 926 | return; |
905 | 927 | ||
906 | if (atomic_xchg(&dc->disk.detaching, 1)) | 928 | if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) |
907 | return; | 929 | return; |
908 | 930 | ||
909 | /* | 931 | /* |
@@ -1030,6 +1052,7 @@ static void cached_dev_free(struct closure *cl) | |||
1030 | struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); | 1052 | struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); |
1031 | 1053 | ||
1032 | cancel_delayed_work_sync(&dc->writeback_rate_update); | 1054 | cancel_delayed_work_sync(&dc->writeback_rate_update); |
1055 | kthread_stop(dc->writeback_thread); | ||
1033 | 1056 | ||
1034 | mutex_lock(&bch_register_lock); | 1057 | mutex_lock(&bch_register_lock); |
1035 | 1058 | ||
@@ -1058,11 +1081,7 @@ static void cached_dev_flush(struct closure *cl) | |||
1058 | struct bcache_device *d = &dc->disk; | 1081 | struct bcache_device *d = &dc->disk; |
1059 | 1082 | ||
1060 | mutex_lock(&bch_register_lock); | 1083 | mutex_lock(&bch_register_lock); |
1061 | d->flush_done = 1; | 1084 | bcache_device_unlink(d); |
1062 | |||
1063 | if (d->c) | ||
1064 | bcache_device_unlink(d); | ||
1065 | |||
1066 | mutex_unlock(&bch_register_lock); | 1085 | mutex_unlock(&bch_register_lock); |
1067 | 1086 | ||
1068 | bch_cache_accounting_destroy(&dc->accounting); | 1087 | bch_cache_accounting_destroy(&dc->accounting); |
@@ -1088,7 +1107,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) | |||
1088 | spin_lock_init(&dc->io_lock); | 1107 | spin_lock_init(&dc->io_lock); |
1089 | bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); | 1108 | bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); |
1090 | 1109 | ||
1091 | dc->sequential_merge = true; | ||
1092 | dc->sequential_cutoff = 4 << 20; | 1110 | dc->sequential_cutoff = 4 << 20; |
1093 | 1111 | ||
1094 | for (io = dc->io; io < dc->io + RECENT_IO; io++) { | 1112 | for (io = dc->io; io < dc->io + RECENT_IO; io++) { |
@@ -1260,7 +1278,8 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) | |||
1260 | { | 1278 | { |
1261 | va_list args; | 1279 | va_list args; |
1262 | 1280 | ||
1263 | if (test_bit(CACHE_SET_STOPPING, &c->flags)) | 1281 | if (c->on_error != ON_ERROR_PANIC && |
1282 | test_bit(CACHE_SET_STOPPING, &c->flags)) | ||
1264 | return false; | 1283 | return false; |
1265 | 1284 | ||
1266 | /* XXX: we can be called from atomic context | 1285 | /* XXX: we can be called from atomic context |
@@ -1275,6 +1294,9 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) | |||
1275 | 1294 | ||
1276 | printk(", disabling caching\n"); | 1295 | printk(", disabling caching\n"); |
1277 | 1296 | ||
1297 | if (c->on_error == ON_ERROR_PANIC) | ||
1298 | panic("panic forced after error\n"); | ||
1299 | |||
1278 | bch_cache_set_unregister(c); | 1300 | bch_cache_set_unregister(c); |
1279 | return true; | 1301 | return true; |
1280 | } | 1302 | } |
@@ -1339,6 +1361,9 @@ static void cache_set_flush(struct closure *cl) | |||
1339 | kobject_put(&c->internal); | 1361 | kobject_put(&c->internal); |
1340 | kobject_del(&c->kobj); | 1362 | kobject_del(&c->kobj); |
1341 | 1363 | ||
1364 | if (c->gc_thread) | ||
1365 | kthread_stop(c->gc_thread); | ||
1366 | |||
1342 | if (!IS_ERR_OR_NULL(c->root)) | 1367 | if (!IS_ERR_OR_NULL(c->root)) |
1343 | list_add(&c->root->list, &c->btree_cache); | 1368 | list_add(&c->root->list, &c->btree_cache); |
1344 | 1369 | ||
@@ -1433,12 +1458,19 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) | |||
1433 | 1458 | ||
1434 | c->sort_crit_factor = int_sqrt(c->btree_pages); | 1459 | c->sort_crit_factor = int_sqrt(c->btree_pages); |
1435 | 1460 | ||
1436 | mutex_init(&c->bucket_lock); | ||
1437 | mutex_init(&c->sort_lock); | ||
1438 | spin_lock_init(&c->sort_time_lock); | ||
1439 | closure_init_unlocked(&c->sb_write); | 1461 | closure_init_unlocked(&c->sb_write); |
1462 | mutex_init(&c->bucket_lock); | ||
1463 | init_waitqueue_head(&c->try_wait); | ||
1464 | init_waitqueue_head(&c->bucket_wait); | ||
1440 | closure_init_unlocked(&c->uuid_write); | 1465 | closure_init_unlocked(&c->uuid_write); |
1441 | spin_lock_init(&c->btree_read_time_lock); | 1466 | mutex_init(&c->sort_lock); |
1467 | |||
1468 | spin_lock_init(&c->sort_time.lock); | ||
1469 | spin_lock_init(&c->btree_gc_time.lock); | ||
1470 | spin_lock_init(&c->btree_split_time.lock); | ||
1471 | spin_lock_init(&c->btree_read_time.lock); | ||
1472 | spin_lock_init(&c->try_harder_time.lock); | ||
1473 | |||
1442 | bch_moving_init_cache_set(c); | 1474 | bch_moving_init_cache_set(c); |
1443 | 1475 | ||
1444 | INIT_LIST_HEAD(&c->list); | 1476 | INIT_LIST_HEAD(&c->list); |
@@ -1483,11 +1515,10 @@ static void run_cache_set(struct cache_set *c) | |||
1483 | const char *err = "cannot allocate memory"; | 1515 | const char *err = "cannot allocate memory"; |
1484 | struct cached_dev *dc, *t; | 1516 | struct cached_dev *dc, *t; |
1485 | struct cache *ca; | 1517 | struct cache *ca; |
1518 | struct closure cl; | ||
1486 | unsigned i; | 1519 | unsigned i; |
1487 | 1520 | ||
1488 | struct btree_op op; | 1521 | closure_init_stack(&cl); |
1489 | bch_btree_op_init_stack(&op); | ||
1490 | op.lock = SHRT_MAX; | ||
1491 | 1522 | ||
1492 | for_each_cache(ca, c, i) | 1523 | for_each_cache(ca, c, i) |
1493 | c->nbuckets += ca->sb.nbuckets; | 1524 | c->nbuckets += ca->sb.nbuckets; |
@@ -1498,7 +1529,7 @@ static void run_cache_set(struct cache_set *c) | |||
1498 | struct jset *j; | 1529 | struct jset *j; |
1499 | 1530 | ||
1500 | err = "cannot allocate memory for journal"; | 1531 | err = "cannot allocate memory for journal"; |
1501 | if (bch_journal_read(c, &journal, &op)) | 1532 | if (bch_journal_read(c, &journal)) |
1502 | goto err; | 1533 | goto err; |
1503 | 1534 | ||
1504 | pr_debug("btree_journal_read() done"); | 1535 | pr_debug("btree_journal_read() done"); |
@@ -1522,23 +1553,23 @@ static void run_cache_set(struct cache_set *c) | |||
1522 | k = &j->btree_root; | 1553 | k = &j->btree_root; |
1523 | 1554 | ||
1524 | err = "bad btree root"; | 1555 | err = "bad btree root"; |
1525 | if (__bch_ptr_invalid(c, j->btree_level + 1, k)) | 1556 | if (bch_btree_ptr_invalid(c, k)) |
1526 | goto err; | 1557 | goto err; |
1527 | 1558 | ||
1528 | err = "error reading btree root"; | 1559 | err = "error reading btree root"; |
1529 | c->root = bch_btree_node_get(c, k, j->btree_level, &op); | 1560 | c->root = bch_btree_node_get(c, k, j->btree_level, true); |
1530 | if (IS_ERR_OR_NULL(c->root)) | 1561 | if (IS_ERR_OR_NULL(c->root)) |
1531 | goto err; | 1562 | goto err; |
1532 | 1563 | ||
1533 | list_del_init(&c->root->list); | 1564 | list_del_init(&c->root->list); |
1534 | rw_unlock(true, c->root); | 1565 | rw_unlock(true, c->root); |
1535 | 1566 | ||
1536 | err = uuid_read(c, j, &op.cl); | 1567 | err = uuid_read(c, j, &cl); |
1537 | if (err) | 1568 | if (err) |
1538 | goto err; | 1569 | goto err; |
1539 | 1570 | ||
1540 | err = "error in recovery"; | 1571 | err = "error in recovery"; |
1541 | if (bch_btree_check(c, &op)) | 1572 | if (bch_btree_check(c)) |
1542 | goto err; | 1573 | goto err; |
1543 | 1574 | ||
1544 | bch_journal_mark(c, &journal); | 1575 | bch_journal_mark(c, &journal); |
@@ -1570,11 +1601,9 @@ static void run_cache_set(struct cache_set *c) | |||
1570 | if (j->version < BCACHE_JSET_VERSION_UUID) | 1601 | if (j->version < BCACHE_JSET_VERSION_UUID) |
1571 | __uuid_write(c); | 1602 | __uuid_write(c); |
1572 | 1603 | ||
1573 | bch_journal_replay(c, &journal, &op); | 1604 | bch_journal_replay(c, &journal); |
1574 | } else { | 1605 | } else { |
1575 | pr_notice("invalidating existing data"); | 1606 | pr_notice("invalidating existing data"); |
1576 | /* Don't want invalidate_buckets() to queue a gc yet */ | ||
1577 | closure_lock(&c->gc, NULL); | ||
1578 | 1607 | ||
1579 | for_each_cache(ca, c, i) { | 1608 | for_each_cache(ca, c, i) { |
1580 | unsigned j; | 1609 | unsigned j; |
@@ -1600,15 +1629,15 @@ static void run_cache_set(struct cache_set *c) | |||
1600 | 1629 | ||
1601 | err = "cannot allocate new UUID bucket"; | 1630 | err = "cannot allocate new UUID bucket"; |
1602 | if (__uuid_write(c)) | 1631 | if (__uuid_write(c)) |
1603 | goto err_unlock_gc; | 1632 | goto err; |
1604 | 1633 | ||
1605 | err = "cannot allocate new btree root"; | 1634 | err = "cannot allocate new btree root"; |
1606 | c->root = bch_btree_node_alloc(c, 0, &op.cl); | 1635 | c->root = bch_btree_node_alloc(c, 0, true); |
1607 | if (IS_ERR_OR_NULL(c->root)) | 1636 | if (IS_ERR_OR_NULL(c->root)) |
1608 | goto err_unlock_gc; | 1637 | goto err; |
1609 | 1638 | ||
1610 | bkey_copy_key(&c->root->key, &MAX_KEY); | 1639 | bkey_copy_key(&c->root->key, &MAX_KEY); |
1611 | bch_btree_node_write(c->root, &op.cl); | 1640 | bch_btree_node_write(c->root, &cl); |
1612 | 1641 | ||
1613 | bch_btree_set_root(c->root); | 1642 | bch_btree_set_root(c->root); |
1614 | rw_unlock(true, c->root); | 1643 | rw_unlock(true, c->root); |
@@ -1621,14 +1650,14 @@ static void run_cache_set(struct cache_set *c) | |||
1621 | SET_CACHE_SYNC(&c->sb, true); | 1650 | SET_CACHE_SYNC(&c->sb, true); |
1622 | 1651 | ||
1623 | bch_journal_next(&c->journal); | 1652 | bch_journal_next(&c->journal); |
1624 | bch_journal_meta(c, &op.cl); | 1653 | bch_journal_meta(c, &cl); |
1625 | |||
1626 | /* Unlock */ | ||
1627 | closure_set_stopped(&c->gc.cl); | ||
1628 | closure_put(&c->gc.cl); | ||
1629 | } | 1654 | } |
1630 | 1655 | ||
1631 | closure_sync(&op.cl); | 1656 | err = "error starting gc thread"; |
1657 | if (bch_gc_thread_start(c)) | ||
1658 | goto err; | ||
1659 | |||
1660 | closure_sync(&cl); | ||
1632 | c->sb.last_mount = get_seconds(); | 1661 | c->sb.last_mount = get_seconds(); |
1633 | bcache_write_super(c); | 1662 | bcache_write_super(c); |
1634 | 1663 | ||
@@ -1638,13 +1667,10 @@ static void run_cache_set(struct cache_set *c) | |||
1638 | flash_devs_run(c); | 1667 | flash_devs_run(c); |
1639 | 1668 | ||
1640 | return; | 1669 | return; |
1641 | err_unlock_gc: | ||
1642 | closure_set_stopped(&c->gc.cl); | ||
1643 | closure_put(&c->gc.cl); | ||
1644 | err: | 1670 | err: |
1645 | closure_sync(&op.cl); | 1671 | closure_sync(&cl); |
1646 | /* XXX: test this, it's broken */ | 1672 | /* XXX: test this, it's broken */ |
1647 | bch_cache_set_error(c, err); | 1673 | bch_cache_set_error(c, "%s", err); |
1648 | } | 1674 | } |
1649 | 1675 | ||
1650 | static bool can_attach_cache(struct cache *ca, struct cache_set *c) | 1676 | static bool can_attach_cache(struct cache *ca, struct cache_set *c) |
@@ -1725,8 +1751,6 @@ void bch_cache_release(struct kobject *kobj) | |||
1725 | if (ca->set) | 1751 | if (ca->set) |
1726 | ca->set->cache[ca->sb.nr_this_dev] = NULL; | 1752 | ca->set->cache[ca->sb.nr_this_dev] = NULL; |
1727 | 1753 | ||
1728 | bch_cache_allocator_exit(ca); | ||
1729 | |||
1730 | bio_split_pool_free(&ca->bio_split_hook); | 1754 | bio_split_pool_free(&ca->bio_split_hook); |
1731 | 1755 | ||
1732 | free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); | 1756 | free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); |
@@ -1758,8 +1782,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) | |||
1758 | __module_get(THIS_MODULE); | 1782 | __module_get(THIS_MODULE); |
1759 | kobject_init(&ca->kobj, &bch_cache_ktype); | 1783 | kobject_init(&ca->kobj, &bch_cache_ktype); |
1760 | 1784 | ||
1761 | INIT_LIST_HEAD(&ca->discards); | ||
1762 | |||
1763 | bio_init(&ca->journal.bio); | 1785 | bio_init(&ca->journal.bio); |
1764 | ca->journal.bio.bi_max_vecs = 8; | 1786 | ca->journal.bio.bi_max_vecs = 8; |
1765 | ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; | 1787 | ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; |
@@ -2006,7 +2028,6 @@ static struct notifier_block reboot = { | |||
2006 | static void bcache_exit(void) | 2028 | static void bcache_exit(void) |
2007 | { | 2029 | { |
2008 | bch_debug_exit(); | 2030 | bch_debug_exit(); |
2009 | bch_writeback_exit(); | ||
2010 | bch_request_exit(); | 2031 | bch_request_exit(); |
2011 | bch_btree_exit(); | 2032 | bch_btree_exit(); |
2012 | if (bcache_kobj) | 2033 | if (bcache_kobj) |
@@ -2039,7 +2060,6 @@ static int __init bcache_init(void) | |||
2039 | sysfs_create_files(bcache_kobj, files) || | 2060 | sysfs_create_files(bcache_kobj, files) || |
2040 | bch_btree_init() || | 2061 | bch_btree_init() || |
2041 | bch_request_init() || | 2062 | bch_request_init() || |
2042 | bch_writeback_init() || | ||
2043 | bch_debug_init(bcache_kobj)) | 2063 | bch_debug_init(bcache_kobj)) |
2044 | goto err; | 2064 | goto err; |
2045 | 2065 | ||
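The super.c hunks above replace the separate atomic_t "closing"/"detaching" counters with a single flags word tested via bit operations (BCACHE_DEV_CLOSING, BCACHE_DEV_DETACHING), so exactly one caller can win a state transition. As a minimal illustration of that idiom only — the structure and flag names below are hypothetical stand-ins, not the bcache code:

#include <linux/bitops.h>
#include <linux/types.h>

#define EXAMPLE_DEV_CLOSING	0
#define EXAMPLE_DEV_DETACHING	1

struct example_device {
	unsigned long		flags;	/* one word holds all state bits */
};

static bool example_start_detach(struct example_device *d)
{
	/* Refuse if the device is already being shut down. */
	if (test_bit(EXAMPLE_DEV_CLOSING, &d->flags))
		return false;

	/*
	 * test_and_set_bit() is atomic: only the caller that observes the
	 * bit clear proceeds; every concurrent caller sees it already set.
	 */
	if (test_and_set_bit(EXAMPLE_DEV_DETACHING, &d->flags))
		return false;

	return true;
}

static void example_finish_detach(struct example_device *d)
{
	clear_bit(EXAMPLE_DEV_DETACHING, &d->flags);
}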
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 924dcfdae111..80d4c2bee18a 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c | |||
@@ -21,6 +21,12 @@ static const char * const cache_replacement_policies[] = { | |||
21 | NULL | 21 | NULL |
22 | }; | 22 | }; |
23 | 23 | ||
24 | static const char * const error_actions[] = { | ||
25 | "unregister", | ||
26 | "panic", | ||
27 | NULL | ||
28 | }; | ||
29 | |||
24 | write_attribute(attach); | 30 | write_attribute(attach); |
25 | write_attribute(detach); | 31 | write_attribute(detach); |
26 | write_attribute(unregister); | 32 | write_attribute(unregister); |
@@ -66,7 +72,6 @@ rw_attribute(congested_read_threshold_us); | |||
66 | rw_attribute(congested_write_threshold_us); | 72 | rw_attribute(congested_write_threshold_us); |
67 | 73 | ||
68 | rw_attribute(sequential_cutoff); | 74 | rw_attribute(sequential_cutoff); |
69 | rw_attribute(sequential_merge); | ||
70 | rw_attribute(data_csum); | 75 | rw_attribute(data_csum); |
71 | rw_attribute(cache_mode); | 76 | rw_attribute(cache_mode); |
72 | rw_attribute(writeback_metadata); | 77 | rw_attribute(writeback_metadata); |
@@ -90,11 +95,14 @@ rw_attribute(discard); | |||
90 | rw_attribute(running); | 95 | rw_attribute(running); |
91 | rw_attribute(label); | 96 | rw_attribute(label); |
92 | rw_attribute(readahead); | 97 | rw_attribute(readahead); |
98 | rw_attribute(errors); | ||
93 | rw_attribute(io_error_limit); | 99 | rw_attribute(io_error_limit); |
94 | rw_attribute(io_error_halflife); | 100 | rw_attribute(io_error_halflife); |
95 | rw_attribute(verify); | 101 | rw_attribute(verify); |
102 | rw_attribute(bypass_torture_test); | ||
96 | rw_attribute(key_merging_disabled); | 103 | rw_attribute(key_merging_disabled); |
97 | rw_attribute(gc_always_rewrite); | 104 | rw_attribute(gc_always_rewrite); |
105 | rw_attribute(expensive_debug_checks); | ||
98 | rw_attribute(freelist_percent); | 106 | rw_attribute(freelist_percent); |
99 | rw_attribute(cache_replacement_policy); | 107 | rw_attribute(cache_replacement_policy); |
100 | rw_attribute(btree_shrinker_disabled); | 108 | rw_attribute(btree_shrinker_disabled); |
@@ -116,6 +124,7 @@ SHOW(__bch_cached_dev) | |||
116 | 124 | ||
117 | sysfs_printf(data_csum, "%i", dc->disk.data_csum); | 125 | sysfs_printf(data_csum, "%i", dc->disk.data_csum); |
118 | var_printf(verify, "%i"); | 126 | var_printf(verify, "%i"); |
127 | var_printf(bypass_torture_test, "%i"); | ||
119 | var_printf(writeback_metadata, "%i"); | 128 | var_printf(writeback_metadata, "%i"); |
120 | var_printf(writeback_running, "%i"); | 129 | var_printf(writeback_running, "%i"); |
121 | var_print(writeback_delay); | 130 | var_print(writeback_delay); |
@@ -150,10 +159,9 @@ SHOW(__bch_cached_dev) | |||
150 | sysfs_hprint(dirty_data, | 159 | sysfs_hprint(dirty_data, |
151 | bcache_dev_sectors_dirty(&dc->disk) << 9); | 160 | bcache_dev_sectors_dirty(&dc->disk) << 9); |
152 | 161 | ||
153 | sysfs_hprint(stripe_size, (1 << dc->disk.stripe_size_bits) << 9); | 162 | sysfs_hprint(stripe_size, dc->disk.stripe_size << 9); |
154 | var_printf(partial_stripes_expensive, "%u"); | 163 | var_printf(partial_stripes_expensive, "%u"); |
155 | 164 | ||
156 | var_printf(sequential_merge, "%i"); | ||
157 | var_hprint(sequential_cutoff); | 165 | var_hprint(sequential_cutoff); |
158 | var_hprint(readahead); | 166 | var_hprint(readahead); |
159 | 167 | ||
@@ -185,6 +193,7 @@ STORE(__cached_dev) | |||
185 | 193 | ||
186 | sysfs_strtoul(data_csum, dc->disk.data_csum); | 194 | sysfs_strtoul(data_csum, dc->disk.data_csum); |
187 | d_strtoul(verify); | 195 | d_strtoul(verify); |
196 | d_strtoul(bypass_torture_test); | ||
188 | d_strtoul(writeback_metadata); | 197 | d_strtoul(writeback_metadata); |
189 | d_strtoul(writeback_running); | 198 | d_strtoul(writeback_running); |
190 | d_strtoul(writeback_delay); | 199 | d_strtoul(writeback_delay); |
@@ -199,7 +208,6 @@ STORE(__cached_dev) | |||
199 | dc->writeback_rate_p_term_inverse, 1, INT_MAX); | 208 | dc->writeback_rate_p_term_inverse, 1, INT_MAX); |
200 | d_strtoul(writeback_rate_d_smooth); | 209 | d_strtoul(writeback_rate_d_smooth); |
201 | 210 | ||
202 | d_strtoul(sequential_merge); | ||
203 | d_strtoi_h(sequential_cutoff); | 211 | d_strtoi_h(sequential_cutoff); |
204 | d_strtoi_h(readahead); | 212 | d_strtoi_h(readahead); |
205 | 213 | ||
@@ -311,7 +319,6 @@ static struct attribute *bch_cached_dev_files[] = { | |||
311 | &sysfs_stripe_size, | 319 | &sysfs_stripe_size, |
312 | &sysfs_partial_stripes_expensive, | 320 | &sysfs_partial_stripes_expensive, |
313 | &sysfs_sequential_cutoff, | 321 | &sysfs_sequential_cutoff, |
314 | &sysfs_sequential_merge, | ||
315 | &sysfs_clear_stats, | 322 | &sysfs_clear_stats, |
316 | &sysfs_running, | 323 | &sysfs_running, |
317 | &sysfs_state, | 324 | &sysfs_state, |
@@ -319,6 +326,7 @@ static struct attribute *bch_cached_dev_files[] = { | |||
319 | &sysfs_readahead, | 326 | &sysfs_readahead, |
320 | #ifdef CONFIG_BCACHE_DEBUG | 327 | #ifdef CONFIG_BCACHE_DEBUG |
321 | &sysfs_verify, | 328 | &sysfs_verify, |
329 | &sysfs_bypass_torture_test, | ||
322 | #endif | 330 | #endif |
323 | NULL | 331 | NULL |
324 | }; | 332 | }; |
@@ -366,7 +374,7 @@ STORE(__bch_flash_dev) | |||
366 | } | 374 | } |
367 | 375 | ||
368 | if (attr == &sysfs_unregister) { | 376 | if (attr == &sysfs_unregister) { |
369 | atomic_set(&d->detaching, 1); | 377 | set_bit(BCACHE_DEV_DETACHING, &d->flags); |
370 | bcache_device_stop(d); | 378 | bcache_device_stop(d); |
371 | } | 379 | } |
372 | 380 | ||
@@ -481,7 +489,6 @@ lock_root: | |||
481 | 489 | ||
482 | sysfs_print(btree_used_percent, btree_used(c)); | 490 | sysfs_print(btree_used_percent, btree_used(c)); |
483 | sysfs_print(btree_nodes, c->gc_stats.nodes); | 491 | sysfs_print(btree_nodes, c->gc_stats.nodes); |
484 | sysfs_hprint(dirty_data, c->gc_stats.dirty); | ||
485 | sysfs_hprint(average_key_size, average_key_size(c)); | 492 | sysfs_hprint(average_key_size, average_key_size(c)); |
486 | 493 | ||
487 | sysfs_print(cache_read_races, | 494 | sysfs_print(cache_read_races, |
@@ -492,6 +499,10 @@ lock_root: | |||
492 | sysfs_print(writeback_keys_failed, | 499 | sysfs_print(writeback_keys_failed, |
493 | atomic_long_read(&c->writeback_keys_failed)); | 500 | atomic_long_read(&c->writeback_keys_failed)); |
494 | 501 | ||
502 | if (attr == &sysfs_errors) | ||
503 | return bch_snprint_string_list(buf, PAGE_SIZE, error_actions, | ||
504 | c->on_error); | ||
505 | |||
495 | /* See count_io_errors for why 88 */ | 506 | /* See count_io_errors for why 88 */ |
496 | sysfs_print(io_error_halflife, c->error_decay * 88); | 507 | sysfs_print(io_error_halflife, c->error_decay * 88); |
497 | sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); | 508 | sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); |
@@ -506,6 +517,8 @@ lock_root: | |||
506 | sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); | 517 | sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); |
507 | sysfs_printf(verify, "%i", c->verify); | 518 | sysfs_printf(verify, "%i", c->verify); |
508 | sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); | 519 | sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); |
520 | sysfs_printf(expensive_debug_checks, | ||
521 | "%i", c->expensive_debug_checks); | ||
509 | sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); | 522 | sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); |
510 | sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); | 523 | sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); |
511 | sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); | 524 | sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); |
@@ -555,7 +568,7 @@ STORE(__bch_cache_set) | |||
555 | } | 568 | } |
556 | 569 | ||
557 | if (attr == &sysfs_trigger_gc) | 570 | if (attr == &sysfs_trigger_gc) |
558 | bch_queue_gc(c); | 571 | wake_up_gc(c); |
559 | 572 | ||
560 | if (attr == &sysfs_prune_cache) { | 573 | if (attr == &sysfs_prune_cache) { |
561 | struct shrink_control sc; | 574 | struct shrink_control sc; |
@@ -569,6 +582,15 @@ STORE(__bch_cache_set) | |||
569 | sysfs_strtoul(congested_write_threshold_us, | 582 | sysfs_strtoul(congested_write_threshold_us, |
570 | c->congested_write_threshold_us); | 583 | c->congested_write_threshold_us); |
571 | 584 | ||
585 | if (attr == &sysfs_errors) { | ||
586 | ssize_t v = bch_read_string_list(buf, error_actions); | ||
587 | |||
588 | if (v < 0) | ||
589 | return v; | ||
590 | |||
591 | c->on_error = v; | ||
592 | } | ||
593 | |||
572 | if (attr == &sysfs_io_error_limit) | 594 | if (attr == &sysfs_io_error_limit) |
573 | c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; | 595 | c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; |
574 | 596 | ||
@@ -579,6 +601,7 @@ STORE(__bch_cache_set) | |||
579 | sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); | 601 | sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); |
580 | sysfs_strtoul(verify, c->verify); | 602 | sysfs_strtoul(verify, c->verify); |
581 | sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); | 603 | sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); |
604 | sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks); | ||
582 | sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); | 605 | sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); |
583 | sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); | 606 | sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); |
584 | sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); | 607 | sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); |
@@ -618,8 +641,8 @@ static struct attribute *bch_cache_set_files[] = { | |||
618 | &sysfs_cache_available_percent, | 641 | &sysfs_cache_available_percent, |
619 | 642 | ||
620 | &sysfs_average_key_size, | 643 | &sysfs_average_key_size, |
621 | &sysfs_dirty_data, | ||
622 | 644 | ||
645 | &sysfs_errors, | ||
623 | &sysfs_io_error_limit, | 646 | &sysfs_io_error_limit, |
624 | &sysfs_io_error_halflife, | 647 | &sysfs_io_error_halflife, |
625 | &sysfs_congested, | 648 | &sysfs_congested, |
@@ -653,6 +676,7 @@ static struct attribute *bch_cache_set_internal_files[] = { | |||
653 | #ifdef CONFIG_BCACHE_DEBUG | 676 | #ifdef CONFIG_BCACHE_DEBUG |
654 | &sysfs_verify, | 677 | &sysfs_verify, |
655 | &sysfs_key_merging_disabled, | 678 | &sysfs_key_merging_disabled, |
679 | &sysfs_expensive_debug_checks, | ||
656 | #endif | 680 | #endif |
657 | &sysfs_gc_always_rewrite, | 681 | &sysfs_gc_always_rewrite, |
658 | &sysfs_btree_shrinker_disabled, | 682 | &sysfs_btree_shrinker_disabled, |
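The sysfs.c hunks above add an "errors" attribute backed by the NULL-terminated error_actions[] list, printed with bch_snprint_string_list() and parsed with bch_read_string_list(). A rough sketch of that show/store pattern follows; the example_* names are hypothetical stand-ins for the real SHOW()/STORE() bodies, and it assumes being built inside drivers/md/bcache:

#include "bcache.h"	/* struct cache_set, c->on_error */
#include "util.h"	/* bch_snprint_string_list(), bch_read_string_list() */

static const char * const example_error_actions[] = {
	"unregister",
	"panic",
	NULL			/* the list must be NULL terminated */
};

/* show: prints the list with the current selection bracketed */
static ssize_t example_show_errors(struct cache_set *c, char *buf)
{
	return bch_snprint_string_list(buf, PAGE_SIZE,
				       example_error_actions, c->on_error);
}

/* store: maps the written string back to its index, or returns an error */
static ssize_t example_store_errors(struct cache_set *c, const char *buf)
{
	ssize_t v = bch_read_string_list(buf, example_error_actions);

	if (v < 0)
		return v;

	c->on_error = v;
	return 0;
}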
diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c index f7b6c197f90f..adbc3df17a80 100644 --- a/drivers/md/bcache/trace.c +++ b/drivers/md/bcache/trace.c | |||
@@ -1,6 +1,5 @@ | |||
1 | #include "bcache.h" | 1 | #include "bcache.h" |
2 | #include "btree.h" | 2 | #include "btree.h" |
3 | #include "request.h" | ||
4 | 3 | ||
5 | #include <linux/blktrace_api.h> | 4 | #include <linux/blktrace_api.h> |
6 | #include <linux/module.h> | 5 | #include <linux/module.h> |
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 420dad545c7d..462214eeacbe 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c | |||
@@ -168,10 +168,14 @@ int bch_parse_uuid(const char *s, char *uuid) | |||
168 | 168 | ||
169 | void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) | 169 | void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) |
170 | { | 170 | { |
171 | uint64_t now = local_clock(); | 171 | uint64_t now, duration, last; |
172 | uint64_t duration = time_after64(now, start_time) | 172 | |
173 | spin_lock(&stats->lock); | ||
174 | |||
175 | now = local_clock(); | ||
176 | duration = time_after64(now, start_time) | ||
173 | ? now - start_time : 0; | 177 | ? now - start_time : 0; |
174 | uint64_t last = time_after64(now, stats->last) | 178 | last = time_after64(now, stats->last) |
175 | ? now - stats->last : 0; | 179 | ? now - stats->last : 0; |
176 | 180 | ||
177 | stats->max_duration = max(stats->max_duration, duration); | 181 | stats->max_duration = max(stats->max_duration, duration); |
@@ -188,6 +192,8 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) | |||
188 | } | 192 | } |
189 | 193 | ||
190 | stats->last = now ?: 1; | 194 | stats->last = now ?: 1; |
195 | |||
196 | spin_unlock(&stats->lock); | ||
191 | } | 197 | } |
192 | 198 | ||
193 | /** | 199 | /** |
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index ea345c6896f4..362c4b3f8b4a 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h | |||
@@ -15,28 +15,18 @@ | |||
15 | 15 | ||
16 | struct closure; | 16 | struct closure; |
17 | 17 | ||
18 | #ifdef CONFIG_BCACHE_EDEBUG | 18 | #ifdef CONFIG_BCACHE_DEBUG |
19 | 19 | ||
20 | #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) | 20 | #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) |
21 | #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) | 21 | #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) |
22 | 22 | ||
23 | #else /* EDEBUG */ | 23 | #else /* DEBUG */ |
24 | 24 | ||
25 | #define atomic_dec_bug(v) atomic_dec(v) | 25 | #define atomic_dec_bug(v) atomic_dec(v) |
26 | #define atomic_inc_bug(v, i) atomic_inc(v) | 26 | #define atomic_inc_bug(v, i) atomic_inc(v) |
27 | 27 | ||
28 | #endif | 28 | #endif |
29 | 29 | ||
30 | #define BITMASK(name, type, field, offset, size) \ | ||
31 | static inline uint64_t name(const type *k) \ | ||
32 | { return (k->field >> offset) & ~(((uint64_t) ~0) << size); } \ | ||
33 | \ | ||
34 | static inline void SET_##name(type *k, uint64_t v) \ | ||
35 | { \ | ||
36 | k->field &= ~(~((uint64_t) ~0 << size) << offset); \ | ||
37 | k->field |= v << offset; \ | ||
38 | } | ||
39 | |||
40 | #define DECLARE_HEAP(type, name) \ | 30 | #define DECLARE_HEAP(type, name) \ |
41 | struct { \ | 31 | struct { \ |
42 | size_t size, used; \ | 32 | size_t size, used; \ |
@@ -388,6 +378,7 @@ ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[ | |||
388 | ssize_t bch_read_string_list(const char *buf, const char * const list[]); | 378 | ssize_t bch_read_string_list(const char *buf, const char * const list[]); |
389 | 379 | ||
390 | struct time_stats { | 380 | struct time_stats { |
381 | spinlock_t lock; | ||
391 | /* | 382 | /* |
392 | * all fields are in nanoseconds, averages are ewmas stored left shifted | 383 | * all fields are in nanoseconds, averages are ewmas stored left shifted |
393 | * by 8 | 384 | * by 8 |
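The util.c/util.h hunks above move the lock into struct time_stats itself and take it inside bch_time_stats_update(), so callers no longer keep a separate per-statistic spinlock. A minimal sketch of that pattern, with hypothetical names:

#include <linux/spinlock.h>
#include <linux/types.h>

struct example_stats {
	spinlock_t	lock;		/* protects the fields below */
	u64		count;
	u64		max_duration;
};

static void example_stats_init(struct example_stats *s)
{
	spin_lock_init(&s->lock);
}

static void example_stats_update(struct example_stats *s, u64 duration)
{
	/* Serialize concurrent updaters; read-modify-write is not atomic. */
	spin_lock(&s->lock);

	s->count++;
	if (duration > s->max_duration)
		s->max_duration = duration;

	spin_unlock(&s->lock);
}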
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index ba3ee48320f2..99053b1251be 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c | |||
@@ -11,18 +11,11 @@ | |||
11 | #include "debug.h" | 11 | #include "debug.h" |
12 | #include "writeback.h" | 12 | #include "writeback.h" |
13 | 13 | ||
14 | #include <linux/delay.h> | ||
15 | #include <linux/freezer.h> | ||
16 | #include <linux/kthread.h> | ||
14 | #include <trace/events/bcache.h> | 17 | #include <trace/events/bcache.h> |
15 | 18 | ||
16 | static struct workqueue_struct *dirty_wq; | ||
17 | |||
18 | static void read_dirty(struct closure *); | ||
19 | |||
20 | struct dirty_io { | ||
21 | struct closure cl; | ||
22 | struct cached_dev *dc; | ||
23 | struct bio bio; | ||
24 | }; | ||
25 | |||
26 | /* Rate limiting */ | 19 | /* Rate limiting */ |
27 | 20 | ||
28 | static void __update_writeback_rate(struct cached_dev *dc) | 21 | static void __update_writeback_rate(struct cached_dev *dc) |
@@ -72,9 +65,6 @@ out: | |||
72 | dc->writeback_rate_derivative = derivative; | 65 | dc->writeback_rate_derivative = derivative; |
73 | dc->writeback_rate_change = change; | 66 | dc->writeback_rate_change = change; |
74 | dc->writeback_rate_target = target; | 67 | dc->writeback_rate_target = target; |
75 | |||
76 | schedule_delayed_work(&dc->writeback_rate_update, | ||
77 | dc->writeback_rate_update_seconds * HZ); | ||
78 | } | 68 | } |
79 | 69 | ||
80 | static void update_writeback_rate(struct work_struct *work) | 70 | static void update_writeback_rate(struct work_struct *work) |
@@ -90,13 +80,16 @@ static void update_writeback_rate(struct work_struct *work) | |||
90 | __update_writeback_rate(dc); | 80 | __update_writeback_rate(dc); |
91 | 81 | ||
92 | up_read(&dc->writeback_lock); | 82 | up_read(&dc->writeback_lock); |
83 | |||
84 | schedule_delayed_work(&dc->writeback_rate_update, | ||
85 | dc->writeback_rate_update_seconds * HZ); | ||
93 | } | 86 | } |
94 | 87 | ||
95 | static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) | 88 | static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) |
96 | { | 89 | { |
97 | uint64_t ret; | 90 | uint64_t ret; |
98 | 91 | ||
99 | if (atomic_read(&dc->disk.detaching) || | 92 | if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || |
100 | !dc->writeback_percent) | 93 | !dc->writeback_percent) |
101 | return 0; | 94 | return 0; |
102 | 95 | ||
@@ -105,37 +98,11 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) | |||
105 | return min_t(uint64_t, ret, HZ); | 98 | return min_t(uint64_t, ret, HZ); |
106 | } | 99 | } |
107 | 100 | ||
108 | /* Background writeback */ | 101 | struct dirty_io { |
109 | 102 | struct closure cl; | |
110 | static bool dirty_pred(struct keybuf *buf, struct bkey *k) | 103 | struct cached_dev *dc; |
111 | { | 104 | struct bio bio; |
112 | return KEY_DIRTY(k); | 105 | }; |
113 | } | ||
114 | |||
115 | static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) | ||
116 | { | ||
117 | uint64_t stripe; | ||
118 | unsigned nr_sectors = KEY_SIZE(k); | ||
119 | struct cached_dev *dc = container_of(buf, struct cached_dev, | ||
120 | writeback_keys); | ||
121 | unsigned stripe_size = 1 << dc->disk.stripe_size_bits; | ||
122 | |||
123 | if (!KEY_DIRTY(k)) | ||
124 | return false; | ||
125 | |||
126 | stripe = KEY_START(k) >> dc->disk.stripe_size_bits; | ||
127 | while (1) { | ||
128 | if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) != | ||
129 | stripe_size) | ||
130 | return false; | ||
131 | |||
132 | if (nr_sectors <= stripe_size) | ||
133 | return true; | ||
134 | |||
135 | nr_sectors -= stripe_size; | ||
136 | stripe++; | ||
137 | } | ||
138 | } | ||
139 | 106 | ||
140 | static void dirty_init(struct keybuf_key *w) | 107 | static void dirty_init(struct keybuf_key *w) |
141 | { | 108 | { |
@@ -153,131 +120,6 @@ static void dirty_init(struct keybuf_key *w) | |||
153 | bch_bio_map(bio, NULL); | 120 | bch_bio_map(bio, NULL); |
154 | } | 121 | } |
155 | 122 | ||
156 | static void refill_dirty(struct closure *cl) | ||
157 | { | ||
158 | struct cached_dev *dc = container_of(cl, struct cached_dev, | ||
159 | writeback.cl); | ||
160 | struct keybuf *buf = &dc->writeback_keys; | ||
161 | bool searched_from_start = false; | ||
162 | struct bkey end = MAX_KEY; | ||
163 | SET_KEY_INODE(&end, dc->disk.id); | ||
164 | |||
165 | if (!atomic_read(&dc->disk.detaching) && | ||
166 | !dc->writeback_running) | ||
167 | closure_return(cl); | ||
168 | |||
169 | down_write(&dc->writeback_lock); | ||
170 | |||
171 | if (!atomic_read(&dc->has_dirty)) { | ||
172 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); | ||
173 | bch_write_bdev_super(dc, NULL); | ||
174 | |||
175 | up_write(&dc->writeback_lock); | ||
176 | closure_return(cl); | ||
177 | } | ||
178 | |||
179 | if (bkey_cmp(&buf->last_scanned, &end) >= 0) { | ||
180 | buf->last_scanned = KEY(dc->disk.id, 0, 0); | ||
181 | searched_from_start = true; | ||
182 | } | ||
183 | |||
184 | if (dc->partial_stripes_expensive) { | ||
185 | uint64_t i; | ||
186 | |||
187 | for (i = 0; i < dc->disk.nr_stripes; i++) | ||
188 | if (atomic_read(dc->disk.stripe_sectors_dirty + i) == | ||
189 | 1 << dc->disk.stripe_size_bits) | ||
190 | goto full_stripes; | ||
191 | |||
192 | goto normal_refill; | ||
193 | full_stripes: | ||
194 | bch_refill_keybuf(dc->disk.c, buf, &end, | ||
195 | dirty_full_stripe_pred); | ||
196 | } else { | ||
197 | normal_refill: | ||
198 | bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); | ||
199 | } | ||
200 | |||
201 | if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { | ||
202 | /* Searched the entire btree - delay awhile */ | ||
203 | |||
204 | if (RB_EMPTY_ROOT(&buf->keys)) { | ||
205 | atomic_set(&dc->has_dirty, 0); | ||
206 | cached_dev_put(dc); | ||
207 | } | ||
208 | |||
209 | if (!atomic_read(&dc->disk.detaching)) | ||
210 | closure_delay(&dc->writeback, dc->writeback_delay * HZ); | ||
211 | } | ||
212 | |||
213 | up_write(&dc->writeback_lock); | ||
214 | |||
215 | bch_ratelimit_reset(&dc->writeback_rate); | ||
216 | |||
217 | /* Punt to workqueue only so we don't recurse and blow the stack */ | ||
218 | continue_at(cl, read_dirty, dirty_wq); | ||
219 | } | ||
220 | |||
221 | void bch_writeback_queue(struct cached_dev *dc) | ||
222 | { | ||
223 | if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) { | ||
224 | if (!atomic_read(&dc->disk.detaching)) | ||
225 | closure_delay(&dc->writeback, dc->writeback_delay * HZ); | ||
226 | |||
227 | continue_at(&dc->writeback.cl, refill_dirty, dirty_wq); | ||
228 | } | ||
229 | } | ||
230 | |||
231 | void bch_writeback_add(struct cached_dev *dc) | ||
232 | { | ||
233 | if (!atomic_read(&dc->has_dirty) && | ||
234 | !atomic_xchg(&dc->has_dirty, 1)) { | ||
235 | atomic_inc(&dc->count); | ||
236 | |||
237 | if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { | ||
238 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); | ||
239 | /* XXX: should do this synchronously */ | ||
240 | bch_write_bdev_super(dc, NULL); | ||
241 | } | ||
242 | |||
243 | bch_writeback_queue(dc); | ||
244 | |||
245 | if (dc->writeback_percent) | ||
246 | schedule_delayed_work(&dc->writeback_rate_update, | ||
247 | dc->writeback_rate_update_seconds * HZ); | ||
248 | } | ||
249 | } | ||
250 | |||
251 | void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, | ||
252 | uint64_t offset, int nr_sectors) | ||
253 | { | ||
254 | struct bcache_device *d = c->devices[inode]; | ||
255 | unsigned stripe_size, stripe_offset; | ||
256 | uint64_t stripe; | ||
257 | |||
258 | if (!d) | ||
259 | return; | ||
260 | |||
261 | stripe_size = 1 << d->stripe_size_bits; | ||
262 | stripe = offset >> d->stripe_size_bits; | ||
263 | stripe_offset = offset & (stripe_size - 1); | ||
264 | |||
265 | while (nr_sectors) { | ||
266 | int s = min_t(unsigned, abs(nr_sectors), | ||
267 | stripe_size - stripe_offset); | ||
268 | |||
269 | if (nr_sectors < 0) | ||
270 | s = -s; | ||
271 | |||
272 | atomic_add(s, d->stripe_sectors_dirty + stripe); | ||
273 | nr_sectors -= s; | ||
274 | stripe_offset = 0; | ||
275 | stripe++; | ||
276 | } | ||
277 | } | ||
278 | |||
279 | /* Background writeback - IO loop */ | ||
280 | |||
281 | static void dirty_io_destructor(struct closure *cl) | 123 | static void dirty_io_destructor(struct closure *cl) |
282 | { | 124 | { |
283 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | 125 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); |
@@ -297,26 +139,25 @@ static void write_dirty_finish(struct closure *cl) | |||
297 | 139 | ||
298 | /* This is kind of a dumb way of signalling errors. */ | 140 | /* This is kind of a dumb way of signalling errors. */ |
299 | if (KEY_DIRTY(&w->key)) { | 141 | if (KEY_DIRTY(&w->key)) { |
142 | int ret; | ||
300 | unsigned i; | 143 | unsigned i; |
301 | struct btree_op op; | 144 | struct keylist keys; |
302 | bch_btree_op_init_stack(&op); | ||
303 | 145 | ||
304 | op.type = BTREE_REPLACE; | 146 | bch_keylist_init(&keys); |
305 | bkey_copy(&op.replace, &w->key); | ||
306 | 147 | ||
307 | SET_KEY_DIRTY(&w->key, false); | 148 | bkey_copy(keys.top, &w->key); |
308 | bch_keylist_add(&op.keys, &w->key); | 149 | SET_KEY_DIRTY(keys.top, false); |
150 | bch_keylist_push(&keys); | ||
309 | 151 | ||
310 | for (i = 0; i < KEY_PTRS(&w->key); i++) | 152 | for (i = 0; i < KEY_PTRS(&w->key); i++) |
311 | atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); | 153 | atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); |
312 | 154 | ||
313 | bch_btree_insert(&op, dc->disk.c); | 155 | ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key); |
314 | closure_sync(&op.cl); | ||
315 | 156 | ||
316 | if (op.insert_collision) | 157 | if (ret) |
317 | trace_bcache_writeback_collision(&w->key); | 158 | trace_bcache_writeback_collision(&w->key); |
318 | 159 | ||
319 | atomic_long_inc(op.insert_collision | 160 | atomic_long_inc(ret |
320 | ? &dc->disk.c->writeback_keys_failed | 161 | ? &dc->disk.c->writeback_keys_failed |
321 | : &dc->disk.c->writeback_keys_done); | 162 | : &dc->disk.c->writeback_keys_done); |
322 | } | 163 | } |
@@ -374,30 +215,33 @@ static void read_dirty_submit(struct closure *cl) | |||
374 | continue_at(cl, write_dirty, system_wq); | 215 | continue_at(cl, write_dirty, system_wq); |
375 | } | 216 | } |
376 | 217 | ||
377 | static void read_dirty(struct closure *cl) | 218 | static void read_dirty(struct cached_dev *dc) |
378 | { | 219 | { |
379 | struct cached_dev *dc = container_of(cl, struct cached_dev, | 220 | unsigned delay = 0; |
380 | writeback.cl); | ||
381 | unsigned delay = writeback_delay(dc, 0); | ||
382 | struct keybuf_key *w; | 221 | struct keybuf_key *w; |
383 | struct dirty_io *io; | 222 | struct dirty_io *io; |
223 | struct closure cl; | ||
224 | |||
225 | closure_init_stack(&cl); | ||
384 | 226 | ||
385 | /* | 227 | /* |
386 | * XXX: if we error, background writeback just spins. Should use some | 228 | * XXX: if we error, background writeback just spins. Should use some |
387 | * mempools. | 229 | * mempools. |
388 | */ | 230 | */ |
389 | 231 | ||
390 | while (1) { | 232 | while (!kthread_should_stop()) { |
233 | try_to_freeze(); | ||
234 | |||
391 | w = bch_keybuf_next(&dc->writeback_keys); | 235 | w = bch_keybuf_next(&dc->writeback_keys); |
392 | if (!w) | 236 | if (!w) |
393 | break; | 237 | break; |
394 | 238 | ||
395 | BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); | 239 | BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); |
396 | 240 | ||
397 | if (delay > 0 && | 241 | if (KEY_START(&w->key) != dc->last_read || |
398 | (KEY_START(&w->key) != dc->last_read || | 242 | jiffies_to_msecs(delay) > 50) |
399 | jiffies_to_msecs(delay) > 50)) | 243 | while (!kthread_should_stop() && delay) |
400 | delay = schedule_timeout_uninterruptible(delay); | 244 | delay = schedule_timeout_interruptible(delay); |
401 | 245 | ||
402 | dc->last_read = KEY_OFFSET(&w->key); | 246 | dc->last_read = KEY_OFFSET(&w->key); |
403 | 247 | ||
@@ -423,7 +267,7 @@ static void read_dirty(struct closure *cl) | |||
423 | trace_bcache_writeback(&w->key); | 267 | trace_bcache_writeback(&w->key); |
424 | 268 | ||
425 | down(&dc->in_flight); | 269 | down(&dc->in_flight); |
426 | closure_call(&io->cl, read_dirty_submit, NULL, cl); | 270 | closure_call(&io->cl, read_dirty_submit, NULL, &cl); |
427 | 271 | ||
428 | delay = writeback_delay(dc, KEY_SIZE(&w->key)); | 272 | delay = writeback_delay(dc, KEY_SIZE(&w->key)); |
429 | } | 273 | } |
@@ -439,52 +283,205 @@ err: | |||
439 | * Wait for outstanding writeback IOs to finish (and keybuf slots to be | 283 | * Wait for outstanding writeback IOs to finish (and keybuf slots to be |
440 | * freed) before refilling again | 284 | * freed) before refilling again |
441 | */ | 285 | */ |
442 | continue_at(cl, refill_dirty, dirty_wq); | 286 | closure_sync(&cl); |
443 | } | 287 | } |
444 | 288 | ||
445 | /* Init */ | 289 | /* Scan for dirty data */ |
290 | |||
291 | void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, | ||
292 | uint64_t offset, int nr_sectors) | ||
293 | { | ||
294 | struct bcache_device *d = c->devices[inode]; | ||
295 | unsigned stripe_offset, stripe, sectors_dirty; | ||
296 | |||
297 | if (!d) | ||
298 | return; | ||
299 | |||
300 | stripe = offset_to_stripe(d, offset); | ||
301 | stripe_offset = offset & (d->stripe_size - 1); | ||
302 | |||
303 | while (nr_sectors) { | ||
304 | int s = min_t(unsigned, abs(nr_sectors), | ||
305 | d->stripe_size - stripe_offset); | ||
306 | |||
307 | if (nr_sectors < 0) | ||
308 | s = -s; | ||
309 | |||
310 | if (stripe >= d->nr_stripes) | ||
311 | return; | ||
312 | |||
313 | sectors_dirty = atomic_add_return(s, | ||
314 | d->stripe_sectors_dirty + stripe); | ||
315 | if (sectors_dirty == d->stripe_size) | ||
316 | set_bit(stripe, d->full_dirty_stripes); | ||
317 | else | ||
318 | clear_bit(stripe, d->full_dirty_stripes); | ||
319 | |||
320 | nr_sectors -= s; | ||
321 | stripe_offset = 0; | ||
322 | stripe++; | ||
323 | } | ||
324 | } | ||
446 | 325 | ||
447 | static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op, | 326 | static bool dirty_pred(struct keybuf *buf, struct bkey *k) |
448 | struct cached_dev *dc) | ||
449 | { | 327 | { |
450 | struct bkey *k; | 328 | return KEY_DIRTY(k); |
451 | struct btree_iter iter; | 329 | } |
452 | 330 | ||
453 | bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0)); | 331 | static void refill_full_stripes(struct cached_dev *dc) |
454 | while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) | 332 | { |
455 | if (!b->level) { | 333 | struct keybuf *buf = &dc->writeback_keys; |
456 | if (KEY_INODE(k) > dc->disk.id) | 334 | unsigned start_stripe, stripe, next_stripe; |
457 | break; | 335 | bool wrapped = false; |
458 | 336 | ||
459 | if (KEY_DIRTY(k)) | 337 | stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned)); |
460 | bcache_dev_sectors_dirty_add(b->c, dc->disk.id, | 338 | |
461 | KEY_START(k), | 339 | if (stripe >= dc->disk.nr_stripes) |
462 | KEY_SIZE(k)); | 340 | stripe = 0; |
463 | } else { | 341 | |
464 | btree(sectors_dirty_init, k, b, op, dc); | 342 | start_stripe = stripe; |
465 | if (KEY_INODE(k) > dc->disk.id) | 343 | |
466 | break; | 344 | while (1) { |
467 | 345 | stripe = find_next_bit(dc->disk.full_dirty_stripes, | |
468 | cond_resched(); | 346 | dc->disk.nr_stripes, stripe); |
347 | |||
348 | if (stripe == dc->disk.nr_stripes) | ||
349 | goto next; | ||
350 | |||
351 | next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes, | ||
352 | dc->disk.nr_stripes, stripe); | ||
353 | |||
354 | buf->last_scanned = KEY(dc->disk.id, | ||
355 | stripe * dc->disk.stripe_size, 0); | ||
356 | |||
357 | bch_refill_keybuf(dc->disk.c, buf, | ||
358 | &KEY(dc->disk.id, | ||
359 | next_stripe * dc->disk.stripe_size, 0), | ||
360 | dirty_pred); | ||
361 | |||
362 | if (array_freelist_empty(&buf->freelist)) | ||
363 | return; | ||
364 | |||
365 | stripe = next_stripe; | ||
366 | next: | ||
367 | if (wrapped && stripe > start_stripe) | ||
368 | return; | ||
369 | |||
370 | if (stripe == dc->disk.nr_stripes) { | ||
371 | stripe = 0; | ||
372 | wrapped = true; | ||
469 | } | 373 | } |
374 | } | ||
375 | } | ||
376 | |||
377 | static bool refill_dirty(struct cached_dev *dc) | ||
378 | { | ||
379 | struct keybuf *buf = &dc->writeback_keys; | ||
380 | struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); | ||
381 | bool searched_from_start = false; | ||
382 | |||
383 | if (dc->partial_stripes_expensive) { | ||
384 | refill_full_stripes(dc); | ||
385 | if (array_freelist_empty(&buf->freelist)) | ||
386 | return false; | ||
387 | } | ||
388 | |||
389 | if (bkey_cmp(&buf->last_scanned, &end) >= 0) { | ||
390 | buf->last_scanned = KEY(dc->disk.id, 0, 0); | ||
391 | searched_from_start = true; | ||
392 | } | ||
393 | |||
394 | bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); | ||
395 | |||
396 | return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start; | ||
397 | } | ||
398 | |||
399 | static int bch_writeback_thread(void *arg) | ||
400 | { | ||
401 | struct cached_dev *dc = arg; | ||
402 | bool searched_full_index; | ||
403 | |||
404 | while (!kthread_should_stop()) { | ||
405 | down_write(&dc->writeback_lock); | ||
406 | if (!atomic_read(&dc->has_dirty) || | ||
407 | (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && | ||
408 | !dc->writeback_running)) { | ||
409 | up_write(&dc->writeback_lock); | ||
410 | set_current_state(TASK_INTERRUPTIBLE); | ||
411 | |||
412 | if (kthread_should_stop()) | ||
413 | return 0; | ||
414 | |||
415 | try_to_freeze(); | ||
416 | schedule(); | ||
417 | continue; | ||
418 | } | ||
419 | |||
420 | searched_full_index = refill_dirty(dc); | ||
421 | |||
422 | if (searched_full_index && | ||
423 | RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { | ||
424 | atomic_set(&dc->has_dirty, 0); | ||
425 | cached_dev_put(dc); | ||
426 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); | ||
427 | bch_write_bdev_super(dc, NULL); | ||
428 | } | ||
429 | |||
430 | up_write(&dc->writeback_lock); | ||
431 | |||
432 | bch_ratelimit_reset(&dc->writeback_rate); | ||
433 | read_dirty(dc); | ||
434 | |||
435 | if (searched_full_index) { | ||
436 | unsigned delay = dc->writeback_delay * HZ; | ||
437 | |||
438 | while (delay && | ||
439 | !kthread_should_stop() && | ||
440 | !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) | ||
441 | delay = schedule_timeout_interruptible(delay); | ||
442 | } | ||
443 | } | ||
470 | 444 | ||
471 | return 0; | 445 | return 0; |
472 | } | 446 | } |
473 | 447 | ||
448 | /* Init */ | ||
449 | |||
450 | struct sectors_dirty_init { | ||
451 | struct btree_op op; | ||
452 | unsigned inode; | ||
453 | }; | ||
454 | |||
455 | static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, | ||
456 | struct bkey *k) | ||
457 | { | ||
458 | struct sectors_dirty_init *op = container_of(_op, | ||
459 | struct sectors_dirty_init, op); | ||
460 | if (KEY_INODE(k) > op->inode) | ||
461 | return MAP_DONE; | ||
462 | |||
463 | if (KEY_DIRTY(k)) | ||
464 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), | ||
465 | KEY_START(k), KEY_SIZE(k)); | ||
466 | |||
467 | return MAP_CONTINUE; | ||
468 | } | ||
469 | |||
474 | void bch_sectors_dirty_init(struct cached_dev *dc) | 470 | void bch_sectors_dirty_init(struct cached_dev *dc) |
475 | { | 471 | { |
476 | struct btree_op op; | 472 | struct sectors_dirty_init op; |
473 | |||
474 | bch_btree_op_init(&op.op, -1); | ||
475 | op.inode = dc->disk.id; | ||
477 | 476 | ||
478 | bch_btree_op_init_stack(&op); | 477 | bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0), |
479 | btree_root(sectors_dirty_init, dc->disk.c, &op, dc); | 478 | sectors_dirty_init_fn, 0); |
480 | } | 479 | } |
481 | 480 | ||
482 | void bch_cached_dev_writeback_init(struct cached_dev *dc) | 481 | int bch_cached_dev_writeback_init(struct cached_dev *dc) |
483 | { | 482 | { |
484 | sema_init(&dc->in_flight, 64); | 483 | sema_init(&dc->in_flight, 64); |
485 | closure_init_unlocked(&dc->writeback); | ||
486 | init_rwsem(&dc->writeback_lock); | 484 | init_rwsem(&dc->writeback_lock); |
487 | |||
488 | bch_keybuf_init(&dc->writeback_keys); | 485 | bch_keybuf_init(&dc->writeback_keys); |
489 | 486 | ||
490 | dc->writeback_metadata = true; | 487 | dc->writeback_metadata = true; |
@@ -498,22 +495,16 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) | |||
498 | dc->writeback_rate_p_term_inverse = 64; | 495 | dc->writeback_rate_p_term_inverse = 64; |
499 | dc->writeback_rate_d_smooth = 8; | 496 | dc->writeback_rate_d_smooth = 8; |
500 | 497 | ||
498 | dc->writeback_thread = kthread_create(bch_writeback_thread, dc, | ||
499 | "bcache_writeback"); | ||
500 | if (IS_ERR(dc->writeback_thread)) | ||
501 | return PTR_ERR(dc->writeback_thread); | ||
502 | |||
503 | set_task_state(dc->writeback_thread, TASK_INTERRUPTIBLE); | ||
504 | |||
501 | INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); | 505 | INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); |
502 | schedule_delayed_work(&dc->writeback_rate_update, | 506 | schedule_delayed_work(&dc->writeback_rate_update, |
503 | dc->writeback_rate_update_seconds * HZ); | 507 | dc->writeback_rate_update_seconds * HZ); |
504 | } | ||
505 | |||
506 | void bch_writeback_exit(void) | ||
507 | { | ||
508 | if (dirty_wq) | ||
509 | destroy_workqueue(dirty_wq); | ||
510 | } | ||
511 | |||
512 | int __init bch_writeback_init(void) | ||
513 | { | ||
514 | dirty_wq = create_workqueue("bcache_writeback"); | ||
515 | if (!dirty_wq) | ||
516 | return -ENOMEM; | ||
517 | 508 | ||
518 | return 0; | 509 | return 0; |
519 | } | 510 | } |
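The writeback.c rewrite above drops the closure/workqueue machinery in favour of a dedicated kernel thread: created in bch_cached_dev_writeback_init(), woken when dirty keys appear, sleeping interruptibly otherwise, and torn down with kthread_stop() from cached_dev_free(). A hypothetical sketch of that lifecycle (not the bcache code itself):

#include <linux/err.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/types.h>

/* Hypothetical per-device state standing in for struct cached_dev. */
struct example_dev {
	struct task_struct	*thread;
	bool			have_work;
};

static int example_worker(void *arg)
{
	struct example_dev *d = arg;

	while (!kthread_should_stop()) {
		if (!d->have_work) {
			set_current_state(TASK_INTERRUPTIBLE);
			/* Re-check after setting state so a stop isn't missed. */
			if (kthread_should_stop()) {
				__set_current_state(TASK_RUNNING);
				break;
			}
			try_to_freeze();
			schedule();
			continue;
		}

		d->have_work = false;
		/* ... process one batch of work here ... */
	}

	return 0;
}

static int example_start(struct example_dev *d)
{
	d->thread = kthread_create(example_worker, d, "example_worker");
	if (IS_ERR(d->thread))
		return PTR_ERR(d->thread);	/* as in bch_cached_dev_writeback_init() */

	wake_up_process(d->thread);
	return 0;
}

static void example_stop(struct example_dev *d)
{
	kthread_stop(d->thread);		/* as in cached_dev_free() */
}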
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index c91f61bb95b6..c9ddcf4614b9 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h | |||
@@ -14,20 +14,27 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) | |||
14 | return ret; | 14 | return ret; |
15 | } | 15 | } |
16 | 16 | ||
17 | static inline bool bcache_dev_stripe_dirty(struct bcache_device *d, | 17 | static inline unsigned offset_to_stripe(struct bcache_device *d, |
18 | uint64_t offset) | ||
19 | { | ||
20 | do_div(offset, d->stripe_size); | ||
21 | return offset; | ||
22 | } | ||
23 | |||
24 | static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, | ||
18 | uint64_t offset, | 25 | uint64_t offset, |
19 | unsigned nr_sectors) | 26 | unsigned nr_sectors) |
20 | { | 27 | { |
21 | uint64_t stripe = offset >> d->stripe_size_bits; | 28 | unsigned stripe = offset_to_stripe(&dc->disk, offset); |
22 | 29 | ||
23 | while (1) { | 30 | while (1) { |
24 | if (atomic_read(d->stripe_sectors_dirty + stripe)) | 31 | if (atomic_read(dc->disk.stripe_sectors_dirty + stripe)) |
25 | return true; | 32 | return true; |
26 | 33 | ||
27 | if (nr_sectors <= 1 << d->stripe_size_bits) | 34 | if (nr_sectors <= dc->disk.stripe_size) |
28 | return false; | 35 | return false; |
29 | 36 | ||
30 | nr_sectors -= 1 << d->stripe_size_bits; | 37 | nr_sectors -= dc->disk.stripe_size; |
31 | stripe++; | 38 | stripe++; |
32 | } | 39 | } |
33 | } | 40 | } |
@@ -38,12 +45,12 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, | |||
38 | unsigned in_use = dc->disk.c->gc_stats.in_use; | 45 | unsigned in_use = dc->disk.c->gc_stats.in_use; |
39 | 46 | ||
40 | if (cache_mode != CACHE_MODE_WRITEBACK || | 47 | if (cache_mode != CACHE_MODE_WRITEBACK || |
41 | atomic_read(&dc->disk.detaching) || | 48 | test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || |
42 | in_use > CUTOFF_WRITEBACK_SYNC) | 49 | in_use > CUTOFF_WRITEBACK_SYNC) |
43 | return false; | 50 | return false; |
44 | 51 | ||
45 | if (dc->partial_stripes_expensive && | 52 | if (dc->partial_stripes_expensive && |
46 | bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector, | 53 | bcache_dev_stripe_dirty(dc, bio->bi_sector, |
47 | bio_sectors(bio))) | 54 | bio_sectors(bio))) |
48 | return true; | 55 | return true; |
49 | 56 | ||
@@ -54,11 +61,30 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, | |||
54 | in_use <= CUTOFF_WRITEBACK; | 61 | in_use <= CUTOFF_WRITEBACK; |
55 | } | 62 | } |
56 | 63 | ||
64 | static inline void bch_writeback_queue(struct cached_dev *dc) | ||
65 | { | ||
66 | wake_up_process(dc->writeback_thread); | ||
67 | } | ||
68 | |||
69 | static inline void bch_writeback_add(struct cached_dev *dc) | ||
70 | { | ||
71 | if (!atomic_read(&dc->has_dirty) && | ||
72 | !atomic_xchg(&dc->has_dirty, 1)) { | ||
73 | atomic_inc(&dc->count); | ||
74 | |||
75 | if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { | ||
76 | SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); | ||
77 | /* XXX: should do this synchronously */ | ||
78 | bch_write_bdev_super(dc, NULL); | ||
79 | } | ||
80 | |||
81 | bch_writeback_queue(dc); | ||
82 | } | ||
83 | } | ||
84 | |||
57 | void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); | 85 | void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); |
58 | void bch_writeback_queue(struct cached_dev *); | ||
59 | void bch_writeback_add(struct cached_dev *); | ||
60 | 86 | ||
61 | void bch_sectors_dirty_init(struct cached_dev *dc); | 87 | void bch_sectors_dirty_init(struct cached_dev *dc); |
62 | void bch_cached_dev_writeback_init(struct cached_dev *); | 88 | int bch_cached_dev_writeback_init(struct cached_dev *); |
63 | 89 | ||
64 | #endif | 90 | #endif |
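The writeback.h hunks above introduce offset_to_stripe(), which uses do_div() so a 64-bit sector offset maps to a stripe index without a plain 64-bit division (not available on 32-bit kernels). A small hypothetical illustration of the do_div() idiom, not the bcache helper itself:

#include <asm/div64.h>
#include <linux/types.h>

/* do_div() divides the u64 in place and returns the remainder. */
static inline unsigned example_offset_to_stripe(uint64_t offset,
						unsigned stripe_size)
{
	do_div(offset, stripe_size);		/* offset becomes the quotient */
	return offset;
}

static inline unsigned example_offset_in_stripe(uint64_t offset,
						unsigned stripe_size)
{
	return do_div(offset, stripe_size);	/* remainder within the stripe */
}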
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 451bf99582ff..846d5c6609d8 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c | |||
@@ -2978,12 +2978,12 @@ static int dasd_alloc_queue(struct dasd_block *block) | |||
2978 | 2978 | ||
2979 | elevator_exit(block->request_queue->elevator); | 2979 | elevator_exit(block->request_queue->elevator); |
2980 | block->request_queue->elevator = NULL; | 2980 | block->request_queue->elevator = NULL; |
2981 | mutex_lock(&block->request_queue->sysfs_lock); | ||
2981 | rc = elevator_init(block->request_queue, "deadline"); | 2982 | rc = elevator_init(block->request_queue, "deadline"); |
2982 | if (rc) { | 2983 | if (rc) |
2983 | blk_cleanup_queue(block->request_queue); | 2984 | blk_cleanup_queue(block->request_queue); |
2984 | return rc; | 2985 | mutex_unlock(&block->request_queue->sysfs_lock); |
2985 | } | 2986 | return rc; |
2986 | return 0; | ||
2987 | } | 2987 | } |
2988 | 2988 | ||
2989 | /* | 2989 | /* |
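The dasd.c hunk above restructures the error path so the queue's sysfs_lock is held across elevator_init() and released on every return path, with cleanup kept inside the locked region. A hedged sketch of that shape, as a hypothetical wrapper rather than the dasd code:

#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/mutex.h>

static int example_switch_to_deadline(struct request_queue *q)
{
	int rc;

	mutex_lock(&q->sysfs_lock);
	rc = elevator_init(q, "deadline");
	if (rc)
		blk_cleanup_queue(q);	/* queue is unusable without an elevator */
	mutex_unlock(&q->sysfs_lock);

	return rc;
}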
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 5ebda976ea93..e2b9576d00e2 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h | |||
@@ -6,11 +6,9 @@ | |||
6 | 6 | ||
7 | #include <linux/tracepoint.h> | 7 | #include <linux/tracepoint.h> |
8 | 8 | ||
9 | struct search; | ||
10 | |||
11 | DECLARE_EVENT_CLASS(bcache_request, | 9 | DECLARE_EVENT_CLASS(bcache_request, |
12 | TP_PROTO(struct search *s, struct bio *bio), | 10 | TP_PROTO(struct bcache_device *d, struct bio *bio), |
13 | TP_ARGS(s, bio), | 11 | TP_ARGS(d, bio), |
14 | 12 | ||
15 | TP_STRUCT__entry( | 13 | TP_STRUCT__entry( |
16 | __field(dev_t, dev ) | 14 | __field(dev_t, dev ) |
@@ -24,8 +22,8 @@ DECLARE_EVENT_CLASS(bcache_request, | |||
24 | 22 | ||
25 | TP_fast_assign( | 23 | TP_fast_assign( |
26 | __entry->dev = bio->bi_bdev->bd_dev; | 24 | __entry->dev = bio->bi_bdev->bd_dev; |
27 | __entry->orig_major = s->d->disk->major; | 25 | __entry->orig_major = d->disk->major; |
28 | __entry->orig_minor = s->d->disk->first_minor; | 26 | __entry->orig_minor = d->disk->first_minor; |
29 | __entry->sector = bio->bi_sector; | 27 | __entry->sector = bio->bi_sector; |
30 | __entry->orig_sector = bio->bi_sector - 16; | 28 | __entry->orig_sector = bio->bi_sector - 16; |
31 | __entry->nr_sector = bio->bi_size >> 9; | 29 | __entry->nr_sector = bio->bi_size >> 9; |
@@ -79,13 +77,13 @@ DECLARE_EVENT_CLASS(btree_node, | |||
79 | /* request.c */ | 77 | /* request.c */ |
80 | 78 | ||
81 | DEFINE_EVENT(bcache_request, bcache_request_start, | 79 | DEFINE_EVENT(bcache_request, bcache_request_start, |
82 | TP_PROTO(struct search *s, struct bio *bio), | 80 | TP_PROTO(struct bcache_device *d, struct bio *bio), |
83 | TP_ARGS(s, bio) | 81 | TP_ARGS(d, bio) |
84 | ); | 82 | ); |
85 | 83 | ||
86 | DEFINE_EVENT(bcache_request, bcache_request_end, | 84 | DEFINE_EVENT(bcache_request, bcache_request_end, |
87 | TP_PROTO(struct search *s, struct bio *bio), | 85 | TP_PROTO(struct bcache_device *d, struct bio *bio), |
88 | TP_ARGS(s, bio) | 86 | TP_ARGS(d, bio) |
89 | ); | 87 | ); |
90 | 88 | ||
91 | DECLARE_EVENT_CLASS(bcache_bio, | 89 | DECLARE_EVENT_CLASS(bcache_bio, |
@@ -370,6 +368,35 @@ DEFINE_EVENT(btree_node, bcache_btree_set_root, | |||
370 | TP_ARGS(b) | 368 | TP_ARGS(b) |
371 | ); | 369 | ); |
372 | 370 | ||
371 | TRACE_EVENT(bcache_keyscan, | ||
372 | TP_PROTO(unsigned nr_found, | ||
373 | unsigned start_inode, uint64_t start_offset, | ||
374 | unsigned end_inode, uint64_t end_offset), | ||
375 | TP_ARGS(nr_found, | ||
376 | start_inode, start_offset, | ||
377 | end_inode, end_offset), | ||
378 | |||
379 | TP_STRUCT__entry( | ||
380 | __field(__u32, nr_found ) | ||
381 | __field(__u32, start_inode ) | ||
382 | __field(__u64, start_offset ) | ||
383 | __field(__u32, end_inode ) | ||
384 | __field(__u64, end_offset ) | ||
385 | ), | ||
386 | |||
387 | TP_fast_assign( | ||
388 | __entry->nr_found = nr_found; | ||
389 | __entry->start_inode = start_inode; | ||
390 | __entry->start_offset = start_offset; | ||
391 | __entry->end_inode = end_inode; | ||
392 | __entry->end_offset = end_offset; | ||
393 | ), | ||
394 | |||
395 | TP_printk("found %u keys from %u:%llu to %u:%llu", __entry->nr_found, | ||
396 | __entry->start_inode, __entry->start_offset, | ||
397 | __entry->end_inode, __entry->end_offset) | ||
398 | ); | ||
399 | |||
373 | /* Allocator */ | 400 | /* Allocator */ |
374 | 401 | ||
375 | TRACE_EVENT(bcache_alloc_invalidate, | 402 | TRACE_EVENT(bcache_alloc_invalidate, |
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h new file mode 100644 index 000000000000..164a7e263988 --- /dev/null +++ b/include/uapi/linux/bcache.h | |||
@@ -0,0 +1,373 @@ | |||
1 | #ifndef _LINUX_BCACHE_H | ||
2 | #define _LINUX_BCACHE_H | ||
3 | |||
4 | /* | ||
5 | * Bcache on disk data structures | ||
6 | */ | ||
7 | |||
8 | #include <asm/types.h> | ||
9 | |||
10 | #define BITMASK(name, type, field, offset, size) \ | ||
11 | static inline __u64 name(const type *k) \ | ||
12 | { return (k->field >> offset) & ~(~0ULL << size); } \ | ||
13 | \ | ||
14 | static inline void SET_##name(type *k, __u64 v) \ | ||
15 | { \ | ||
16 | k->field &= ~(~(~0ULL << size) << offset); \ | ||
17 | k->field |= (v & ~(~0ULL << size)) << offset; \ | ||
18 | } | ||
19 | |||
20 | /* Btree keys - all units are in sectors */ | ||
21 | |||
22 | struct bkey { | ||
23 | __u64 high; | ||
24 | __u64 low; | ||
25 | __u64 ptr[]; | ||
26 | }; | ||
27 | |||
28 | #define KEY_FIELD(name, field, offset, size) \ | ||
29 | BITMASK(name, struct bkey, field, offset, size) | ||
30 | |||
31 | #define PTR_FIELD(name, offset, size) \ | ||
32 | static inline __u64 name(const struct bkey *k, unsigned i) \ | ||
33 | { return (k->ptr[i] >> offset) & ~(~0ULL << size); } \ | ||
34 | \ | ||
35 | static inline void SET_##name(struct bkey *k, unsigned i, __u64 v) \ | ||
36 | { \ | ||
37 | k->ptr[i] &= ~(~(~0ULL << size) << offset); \ | ||
38 | k->ptr[i] |= (v & ~(~0ULL << size)) << offset; \ | ||
39 | } | ||
40 | |||
41 | #define KEY_SIZE_BITS 16 | ||
42 | |||
43 | KEY_FIELD(KEY_PTRS, high, 60, 3) | ||
44 | KEY_FIELD(HEADER_SIZE, high, 58, 2) | ||
45 | KEY_FIELD(KEY_CSUM, high, 56, 2) | ||
46 | KEY_FIELD(KEY_PINNED, high, 55, 1) | ||
47 | KEY_FIELD(KEY_DIRTY, high, 36, 1) | ||
48 | |||
49 | KEY_FIELD(KEY_SIZE, high, 20, KEY_SIZE_BITS) | ||
50 | KEY_FIELD(KEY_INODE, high, 0, 20) | ||
51 | |||
52 | /* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */ | ||
53 | |||
54 | static inline __u64 KEY_OFFSET(const struct bkey *k) | ||
55 | { | ||
56 | return k->low; | ||
57 | } | ||
58 | |||
59 | static inline void SET_KEY_OFFSET(struct bkey *k, __u64 v) | ||
60 | { | ||
61 | k->low = v; | ||
62 | } | ||
63 | |||
64 | /* | ||
65 | * The high bit being set is a relic from when we used it to do binary | ||
66 | * searches - it told you where a key started. It's not used anymore, | ||
67 | * and can probably be safely dropped. | ||
68 | */ | ||
69 | #define KEY(inode, offset, size) \ | ||
70 | ((struct bkey) { \ | ||
71 | .high = (1ULL << 63) | ((__u64) (size) << 20) | (inode), \ | ||
72 | .low = (offset) \ | ||
73 | }) | ||
74 | |||
75 | #define ZERO_KEY KEY(0, 0, 0) | ||
76 | |||
77 | #define MAX_KEY_INODE (~(~0U << 20)) | ||
78 | #define MAX_KEY_OFFSET (~0ULL >> 1) | ||
79 | #define MAX_KEY KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0) | ||
80 | |||
81 | #define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) | ||
82 | #define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) | ||
83 | |||
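A hedged example of how these key macros compose, with made-up values; as the KEY_START() definition suggests, the offset stored in a key records the end of the extent it covers:

        /* a 16-sector extent of inode 1 ending at sector 1024 */
        struct bkey k = KEY(1, 1024, 16);

        /*
         * KEY_INODE(&k)  == 1
         * KEY_OFFSET(&k) == 1024
         * KEY_SIZE(&k)   == 16
         * KEY_START(&k)  == 1008, the first sector covered
         * START_KEY(&k) is the corresponding zero-size key 1:1008
         */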
84 | #define PTR_DEV_BITS 12 | ||
85 | |||
86 | PTR_FIELD(PTR_DEV, 51, PTR_DEV_BITS) | ||
87 | PTR_FIELD(PTR_OFFSET, 8, 43) | ||
88 | PTR_FIELD(PTR_GEN, 0, 8) | ||
89 | |||
90 | #define PTR_CHECK_DEV ((1 << PTR_DEV_BITS) - 1) | ||
91 | |||
92 | #define PTR(gen, offset, dev) \ | ||
93 | ((((__u64) dev) << 51) | ((__u64) offset) << 8 | gen) | ||
94 | |||
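Correspondingly, a sketch of packing and unpacking one pointer with PTR() and the PTR_FIELD() accessors; values are invented, and it borrows the BKEY_PADDED() helper defined further down so the stack key has room for ptr[0]:

        BKEY_PADDED(k) tmp;

        tmp.k = KEY(1, 1024, 16);
        /* pointer into cache device 0, bucket offset 2048 sectors, generation 3 */
        tmp.k.ptr[0] = PTR(3, 2048, 0);
        SET_KEY_PTRS(&tmp.k, 1);

        /*
         * PTR_DEV(&tmp.k, 0)    == 0
         * PTR_OFFSET(&tmp.k, 0) == 2048
         * PTR_GEN(&tmp.k, 0)    == 3
         */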
95 | /* Bkey utility code */ | ||
96 | |||
97 | static inline unsigned long bkey_u64s(const struct bkey *k) | ||
98 | { | ||
99 | return (sizeof(struct bkey) / sizeof(__u64)) + KEY_PTRS(k); | ||
100 | } | ||
101 | |||
102 | static inline unsigned long bkey_bytes(const struct bkey *k) | ||
103 | { | ||
104 | return bkey_u64s(k) * sizeof(__u64); | ||
105 | } | ||
106 | |||
107 | #define bkey_copy(_dest, _src) memcpy(_dest, _src, bkey_bytes(_src)) | ||
108 | |||
109 | static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src) | ||
110 | { | ||
111 | SET_KEY_INODE(dest, KEY_INODE(src)); | ||
112 | SET_KEY_OFFSET(dest, KEY_OFFSET(src)); | ||
113 | } | ||
114 | |||
115 | static inline struct bkey *bkey_next(const struct bkey *k) | ||
116 | { | ||
117 | __u64 *d = (void *) k; | ||
118 | return (struct bkey *) (d + bkey_u64s(k)); | ||
119 | } | ||
120 | |||
121 | static inline struct bkey *bkey_last(const struct bkey *k, unsigned nr_keys) | ||
122 | { | ||
123 | __u64 *d = (void *) k; | ||
124 | return (struct bkey *) (d + nr_keys); | ||
125 | } | ||
126 | /* Enough for a key with 6 pointers */ | ||
127 | #define BKEY_PAD 8 | ||
128 | |||
129 | #define BKEY_PADDED(key) \ | ||
130 | union { struct bkey key; __u64 key ## _pad[BKEY_PAD]; } | ||
131 | |||
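A sketch of how bkey_next() and bkey_last() combine to walk a packed run of keys; walk_keys() is a hypothetical name, and going by bkey_last()'s arithmetic the count is in 64-bit words rather than keys, which appears to match the 'keys' fields of the jset and bset structures below:

static void walk_keys(const struct bkey *start, unsigned nr_u64s)
{
        const struct bkey *k;

        for (k = start; k < bkey_last(start, nr_u64s); k = bkey_next(k)) {
                /* each key spans bkey_u64s(k) words: high, low, then KEY_PTRS(k) pointers */
        }
}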
132 | /* Superblock */ | ||
133 | |||
134 | /* Version 0: Cache device | ||
135 | * Version 1: Backing device | ||
136 | * Version 2: Seed pointer into btree node checksum | ||
137 | * Version 3: Cache device with new UUID format | ||
138 | * Version 4: Backing device with data offset | ||
139 | */ | ||
140 | #define BCACHE_SB_VERSION_CDEV 0 | ||
141 | #define BCACHE_SB_VERSION_BDEV 1 | ||
142 | #define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 | ||
143 | #define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 | ||
144 | #define BCACHE_SB_MAX_VERSION 4 | ||
145 | |||
146 | #define SB_SECTOR 8 | ||
147 | #define SB_SIZE 4096 | ||
148 | #define SB_LABEL_SIZE 32 | ||
149 | #define SB_JOURNAL_BUCKETS 256U | ||
150 | /* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ | ||
151 | #define MAX_CACHES_PER_SET 8 | ||
152 | |||
153 | #define BDEV_DATA_START_DEFAULT 16 /* sectors */ | ||
154 | |||
155 | struct cache_sb { | ||
156 | __u64 csum; | ||
157 | __u64 offset; /* sector where this sb was written */ | ||
158 | __u64 version; | ||
159 | |||
160 | __u8 magic[16]; | ||
161 | |||
162 | __u8 uuid[16]; | ||
163 | union { | ||
164 | __u8 set_uuid[16]; | ||
165 | __u64 set_magic; | ||
166 | }; | ||
167 | __u8 label[SB_LABEL_SIZE]; | ||
168 | |||
169 | __u64 flags; | ||
170 | __u64 seq; | ||
171 | __u64 pad[8]; | ||
172 | |||
173 | union { | ||
174 | struct { | ||
175 | /* Cache devices */ | ||
176 | __u64 nbuckets; /* device size */ | ||
177 | |||
178 | __u16 block_size; /* sectors */ | ||
179 | __u16 bucket_size; /* sectors */ | ||
180 | |||
181 | __u16 nr_in_set; | ||
182 | __u16 nr_this_dev; | ||
183 | }; | ||
184 | struct { | ||
185 | /* Backing devices */ | ||
186 | __u64 data_offset; | ||
187 | |||
188 | /* | ||
189 | * block_size from the cache device section is still used by | ||
190 | * backing devices, so don't add anything here until we fix | ||
191 | * things to not need it for backing devices anymore | ||
192 | */ | ||
193 | }; | ||
194 | }; | ||
195 | |||
196 | __u32 last_mount; /* time_t */ | ||
197 | |||
198 | __u16 first_bucket; | ||
199 | union { | ||
200 | __u16 njournal_buckets; | ||
201 | __u16 keys; | ||
202 | }; | ||
203 | __u64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ | ||
204 | }; | ||
205 | |||
206 | static inline _Bool SB_IS_BDEV(const struct cache_sb *sb) | ||
207 | { | ||
208 | return sb->version == BCACHE_SB_VERSION_BDEV | ||
209 | || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; | ||
210 | } | ||
211 | |||
212 | BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); | ||
213 | BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); | ||
214 | BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); | ||
215 | #define CACHE_REPLACEMENT_LRU 0U | ||
216 | #define CACHE_REPLACEMENT_FIFO 1U | ||
217 | #define CACHE_REPLACEMENT_RANDOM 2U | ||
218 | |||
219 | BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); | ||
220 | #define CACHE_MODE_WRITETHROUGH 0U | ||
221 | #define CACHE_MODE_WRITEBACK 1U | ||
222 | #define CACHE_MODE_WRITEAROUND 2U | ||
223 | #define CACHE_MODE_NONE 3U | ||
224 | BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); | ||
225 | #define BDEV_STATE_NONE 0U | ||
226 | #define BDEV_STATE_CLEAN 1U | ||
227 | #define BDEV_STATE_DIRTY 2U | ||
228 | #define BDEV_STATE_STALE 3U | ||
229 | |||
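A hypothetical userspace probe, not part of the patch, showing how the superblock constants, SB_IS_BDEV() and the flag accessors fit together; it assumes this header is installed as <linux/bcache.h>, assumes 512-byte sectors, and skips checksum and magic verification:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/bcache.h>

int main(int argc, char **argv)
{
        struct cache_sb sb;
        int fd;

        if (argc != 2)
                return 1;

        fd = open(argv[1], O_RDONLY);
        if (fd < 0)
                return 1;

        /* the superblock lives SB_SECTOR (8) 512-byte sectors into the device */
        if (pread(fd, &sb, sizeof(sb), SB_SECTOR * 512) != (ssize_t) sizeof(sb)) {
                close(fd);
                return 1;
        }

        if (SB_IS_BDEV(&sb))
                printf("backing device: state %llu, cache mode %llu\n",
                       (unsigned long long) BDEV_STATE(&sb),
                       (unsigned long long) BDEV_CACHE_MODE(&sb));
        else
                printf("cache device: version %llu, %llu buckets of %u sectors\n",
                       (unsigned long long) sb.version,
                       (unsigned long long) sb.nbuckets,
                       (unsigned) sb.bucket_size);

        close(fd);
        return 0;
}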
230 | /* | ||
231 | * Magic numbers | ||
232 | * | ||
233 | * The various other data structures have their own magic numbers, which are | ||
234 | * xored with the first part of the cache set's UUID | ||
235 | */ | ||
236 | |||
237 | #define JSET_MAGIC 0x245235c1a3625032ULL | ||
238 | #define PSET_MAGIC 0x6750e15f87337f91ULL | ||
239 | #define BSET_MAGIC 0x90135c78b99e07f5ULL | ||
240 | |||
241 | static inline __u64 jset_magic(struct cache_sb *sb) | ||
242 | { | ||
243 | return sb->set_magic ^ JSET_MAGIC; | ||
244 | } | ||
245 | |||
246 | static inline __u64 pset_magic(struct cache_sb *sb) | ||
247 | { | ||
248 | return sb->set_magic ^ PSET_MAGIC; | ||
249 | } | ||
250 | |||
251 | static inline __u64 bset_magic(struct cache_sb *sb) | ||
252 | { | ||
253 | return sb->set_magic ^ BSET_MAGIC; | ||
254 | } | ||
255 | |||
256 | /* | ||
257 | * Journal | ||
258 | * | ||
259 | * On disk format for a journal entry: | ||
260 | * seq is monotonically increasing; every journal entry has its own unique | ||
261 | * sequence number. | ||
262 | * | ||
263 | * last_seq is the oldest journal entry that still has keys the btree hasn't | ||
264 | * flushed to disk yet. | ||
265 | * | ||
266 | * version is for on disk format changes. | ||
267 | */ | ||
268 | |||
269 | #define BCACHE_JSET_VERSION_UUIDv1 1 | ||
270 | #define BCACHE_JSET_VERSION_UUID 1 /* Always latest UUID format */ | ||
271 | #define BCACHE_JSET_VERSION 1 | ||
272 | |||
273 | struct jset { | ||
274 | __u64 csum; | ||
275 | __u64 magic; | ||
276 | __u64 seq; | ||
277 | __u32 version; | ||
278 | __u32 keys; | ||
279 | |||
280 | __u64 last_seq; | ||
281 | |||
282 | BKEY_PADDED(uuid_bucket); | ||
283 | BKEY_PADDED(btree_root); | ||
284 | __u16 btree_level; | ||
285 | __u16 pad[3]; | ||
286 | |||
287 | __u64 prio_bucket[MAX_CACHES_PER_SET]; | ||
288 | |||
289 | union { | ||
290 | struct bkey start[0]; | ||
291 | __u64 d[0]; | ||
292 | }; | ||
293 | }; | ||
294 | |||
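A small sketch of the seq/last_seq relationship described in the journal comment above; jset_keep() is a hypothetical name. Given the newest entry on disk, any entry whose sequence number is below that entry's last_seq no longer holds unflushed keys and can be ignored during replay:

static _Bool jset_keep(const struct jset *newest, __u64 seq)
{
        /* entries older than newest->last_seq have been flushed into the btree */
        return seq >= newest->last_seq;
}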
295 | /* Bucket prios/gens */ | ||
296 | |||
297 | struct prio_set { | ||
298 | __u64 csum; | ||
299 | __u64 magic; | ||
300 | __u64 seq; | ||
301 | __u32 version; | ||
302 | __u32 pad; | ||
303 | |||
304 | __u64 next_bucket; | ||
305 | |||
306 | struct bucket_disk { | ||
307 | __u16 prio; | ||
308 | __u8 gen; | ||
309 | } __attribute((packed)) data[]; | ||
310 | }; | ||
311 | |||
312 | /* UUIDS - per backing device/flash only volume metadata */ | ||
313 | |||
314 | struct uuid_entry { | ||
315 | union { | ||
316 | struct { | ||
317 | __u8 uuid[16]; | ||
318 | __u8 label[32]; | ||
319 | __u32 first_reg; | ||
320 | __u32 last_reg; | ||
321 | __u32 invalidated; | ||
322 | |||
323 | __u32 flags; | ||
324 | /* Size of flash only volumes */ | ||
325 | __u64 sectors; | ||
326 | }; | ||
327 | |||
328 | __u8 pad[128]; | ||
329 | }; | ||
330 | }; | ||
331 | |||
332 | BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); | ||
333 | |||
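A hedged helper sketch using UUID_FLASH_ONLY() and the sectors field; flash_vol_bytes() is a hypothetical name and 512-byte sectors are assumed:

static __u64 flash_vol_bytes(const struct uuid_entry *u)
{
        /* 'sectors' is only meaningful for flash-only volumes */
        return UUID_FLASH_ONLY(u) ? u->sectors * 512ULL : 0;
}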
334 | /* Btree nodes */ | ||
335 | |||
336 | /* Version 1: Seed pointer into btree node checksum | ||
337 | */ | ||
338 | #define BCACHE_BSET_CSUM 1 | ||
339 | #define BCACHE_BSET_VERSION 1 | ||
340 | |||
341 | /* | ||
342 | * Btree nodes | ||
343 | * | ||
344 | * On disk a btree node is a list/log of these; within each set the keys are | ||
345 | * sorted | ||
346 | */ | ||
347 | struct bset { | ||
348 | __u64 csum; | ||
349 | __u64 magic; | ||
350 | __u64 seq; | ||
351 | __u32 version; | ||
352 | __u32 keys; | ||
353 | |||
354 | union { | ||
355 | struct bkey start[0]; | ||
356 | __u64 d[0]; | ||
357 | }; | ||
358 | }; | ||
359 | |||
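Tying this back to the magic helpers above, a hedged sketch of how a bset read from disk might be matched against the cache set it claims to belong to; bset_magic_ok() is a hypothetical name:

static _Bool bset_magic_ok(struct cache_sb *sb, const struct bset *i)
{
        /* each bset carries the set magic xored with BSET_MAGIC */
        return i->magic == bset_magic(sb) && i->version <= BCACHE_BSET_VERSION;
}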
360 | /* OBSOLETE */ | ||
361 | |||
362 | /* UUIDS - per backing device/flash only volume metadata */ | ||
363 | |||
364 | struct uuid_entry_v0 { | ||
365 | __u8 uuid[16]; | ||
366 | __u8 label[32]; | ||
367 | __u32 first_reg; | ||
368 | __u32 last_reg; | ||
369 | __u32 invalidated; | ||
370 | __u32 pad; | ||
371 | }; | ||
372 | |||
373 | #endif /* _LINUX_BCACHE_H */ | ||