author     Linus Torvalds <torvalds@linux-foundation.org>   2013-07-22 22:02:52 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2013-07-22 22:02:52 -0400
commit     d4c90b1b9fe907da0d310008e5a769b591a14399 (patch)
tree       d37589ab70ada2778d315a0ad24d6e68c8615af6
parent     3b2f64d00c46e1e4e9bd0bb9bb12619adac27a4b (diff)
parent     0878ae2db83a10894724cdeaba7ef9f1ac1c9ac8 (diff)
Merge branch 'for-3.11/drivers' of git://git.kernel.dk/linux-block
Pull block IO driver bits from Jens Axboe:
"As I mentioned in the core block pull request, due to real life
circumstances the driver pull request would be late. Now it looks
like -rc2 late... On the plus side, apart from the rsxx update, these
are all things that I could argue could go in later in the cycle as
they are fixes and not features. So even though things are late, it's
not ALL bad.
The pull request contains:
- Updates to bcache, all bug fixes, from Kent.
- A pile of drbd bug fixes (no big features this time!).
- xen blk front/back fixes.
- rsxx driver updates, some of them deferred from 3.10. So should be
well cooked by now"
* 'for-3.11/drivers' of git://git.kernel.dk/linux-block: (63 commits)
bcache: Allocation kthread fixes
bcache: Fix GC_SECTORS_USED() calculation
bcache: Journal replay fix
bcache: Shutdown fix
bcache: Fix a sysfs splat on shutdown
bcache: Advertise that flushes are supported
bcache: check for allocation failures
bcache: Fix a dumb race
bcache: Use standard utility code
bcache: Update email address
bcache: Delete fuzz tester
bcache: Document shrinker reserve better
bcache: FUA fixes
drbd: Allow online change of al-stripes and al-stripe-size
drbd: Constants should be UPPERCASE
drbd: Ignore the exit code of a fence-peer handler if it returns too late
drbd: Fix rcu_read_lock balance on error path
drbd: fix error return code in drbd_init()
drbd: Do not sleep inside rcu
bcache: Refresh usage docs
...
47 files changed, 3183 insertions, 1550 deletions
diff --git a/Documentation/ABI/testing/sysfs-driver-xen-blkback b/Documentation/ABI/testing/sysfs-driver-xen-blkback
new file mode 100644
index 000000000000..8bb43b66eb55
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-xen-blkback
@@ -0,0 +1,17 @@ | |||
1 | What: /sys/module/xen_blkback/parameters/max_buffer_pages | ||
2 | Date: March 2013 | ||
3 | KernelVersion: 3.11 | ||
4 | Contact: Roger Pau Monné <roger.pau@citrix.com> | ||
5 | Description: | ||
6 | Maximum number of free pages to keep in each block | ||
7 | backend buffer. | ||
8 | |||
9 | What: /sys/module/xen_blkback/parameters/max_persistent_grants | ||
10 | Date: March 2013 | ||
11 | KernelVersion: 3.11 | ||
12 | Contact: Roger Pau Monné <roger.pau@citrix.com> | ||
13 | Description: | ||
14 | Maximum number of grants to map persistently in | ||
15 | blkback. If the frontend tries to use more than | ||
16 | max_persistent_grants, the LRU kicks in and starts | ||
17 | removing 5% of max_persistent_grants every 100ms. | ||
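As a rough illustration of the eviction rate documented above (a sketch; the value of max_persistent_grants below is an assumed example, not the module default):

    /* Sketch: LRU trim rate implied by the description above -
     * 5% of max_persistent_grants every 100 ms.
     * The starting value is an assumption for illustration only. */
    #include <stdio.h>

    int main(void)
    {
            unsigned int max_persistent_grants = 1056;  /* assumed example */
            unsigned int per_pass = max_persistent_grants * 5 / 100;

            /* one pass every 100 ms => ten passes per second */
            printf("LRU removes %u grants per pass, up to %u per second\n",
                   per_pass, per_pass * 10);
            return 0;
    }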
diff --git a/Documentation/ABI/testing/sysfs-driver-xen-blkfront b/Documentation/ABI/testing/sysfs-driver-xen-blkfront
new file mode 100644
index 000000000000..c0a6cb7eb314
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-xen-blkfront
@@ -0,0 +1,10 @@ | |||
1 | What: /sys/module/xen_blkfront/parameters/max | ||
2 | Date: June 2013 | ||
3 | KernelVersion: 3.11 | ||
4 | Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> | ||
5 | Description: | ||
6 | Maximum number of segments that the frontend will negotiate | ||
7 | with the backend for indirect descriptors. The default value | ||
8 | is 32 - higher value means more potential throughput but more | ||
9 | memory usage. The backend picks the minimum of the frontend | ||
10 | and its default backend value. | ||
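A one-line sketch of the negotiation rule described above (a hypothetical helper for illustration, not code from the driver):

    /* Hypothetical helper mirroring the documented rule: the backend
     * uses the minimum of the frontend's request and its own default. */
    static unsigned int negotiated_segments(unsigned int frontend_max,
                                            unsigned int backend_default)
    {
            return frontend_max < backend_default ? frontend_max : backend_default;
    }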
diff --git a/Documentation/bcache.txt b/Documentation/bcache.txt
index c3365f26b2d9..32b6c3189d98 100644
--- a/Documentation/bcache.txt
+++ b/Documentation/bcache.txt
@@ -46,29 +46,33 @@ you format your backing devices and cache device at the same time, you won't | |||
46 | have to manually attach: | 46 | have to manually attach: |
47 | make-bcache -B /dev/sda /dev/sdb -C /dev/sdc | 47 | make-bcache -B /dev/sda /dev/sdb -C /dev/sdc |
48 | 48 | ||
49 | To make bcache devices known to the kernel, echo them to /sys/fs/bcache/register: | 49 | bcache-tools now ships udev rules, and bcache devices are known to the kernel |
50 | immediately. Without udev, you can manually register devices like this: | ||
50 | 51 | ||
51 | echo /dev/sdb > /sys/fs/bcache/register | 52 | echo /dev/sdb > /sys/fs/bcache/register |
52 | echo /dev/sdc > /sys/fs/bcache/register | 53 | echo /dev/sdc > /sys/fs/bcache/register |
53 | 54 | ||
54 | To register your bcache devices automatically, you could add something like | 55 | Registering the backing device makes the bcache device show up in /dev; you can |
55 | this to an init script: | 56 | now format it and use it as normal. But the first time using a new bcache |
57 | device, it'll be running in passthrough mode until you attach it to a cache. | ||
58 | See the section on attaching. | ||
56 | 59 | ||
57 | echo /dev/sd* > /sys/fs/bcache/register_quiet | 60 | The devices show up as: |
58 | 61 | ||
59 | It'll look for bcache superblocks and ignore everything that doesn't have one. | 62 | /dev/bcache<N> |
60 | 63 | ||
61 | Registering the backing device makes the bcache show up in /dev; you can now | 64 | As well as (with udev): |
62 | format it and use it as normal. But the first time using a new bcache device, | ||
63 | it'll be running in passthrough mode until you attach it to a cache. See the | ||
64 | section on attaching. | ||
65 | 65 | ||
66 | The devices show up at /dev/bcacheN, and can be controlled via sysfs from | 66 | /dev/bcache/by-uuid/<uuid> |
67 | /sys/block/bcacheN/bcache: | 67 | /dev/bcache/by-label/<label> |
68 | |||
69 | To get started: | ||
68 | 70 | ||
69 | mkfs.ext4 /dev/bcache0 | 71 | mkfs.ext4 /dev/bcache0 |
70 | mount /dev/bcache0 /mnt | 72 | mount /dev/bcache0 /mnt |
71 | 73 | ||
74 | You can control bcache devices through sysfs at /sys/block/bcache<N>/bcache . | ||
75 | |||
72 | Cache devices are managed as sets; multiple caches per set isn't supported yet | 76 | Cache devices are managed as sets; multiple caches per set isn't supported yet |
73 | but will allow for mirroring of metadata and dirty data in the future. Your new | 77 | but will allow for mirroring of metadata and dirty data in the future. Your new |
74 | cache set shows up as /sys/fs/bcache/<UUID> | 78 | cache set shows up as /sys/fs/bcache/<UUID> |
@@ -80,11 +84,11 @@ must be attached to your cache set to enable caching. Attaching a backing | |||
80 | device to a cache set is done thusly, with the UUID of the cache set in | 84 | device to a cache set is done thusly, with the UUID of the cache set in |
81 | /sys/fs/bcache: | 85 | /sys/fs/bcache: |
82 | 86 | ||
83 | echo <UUID> > /sys/block/bcache0/bcache/attach | 87 | echo <CSET-UUID> > /sys/block/bcache0/bcache/attach |
84 | 88 | ||
85 | This only has to be done once. The next time you reboot, just reregister all | 89 | This only has to be done once. The next time you reboot, just reregister all |
86 | your bcache devices. If a backing device has data in a cache somewhere, the | 90 | your bcache devices. If a backing device has data in a cache somewhere, the |
87 | /dev/bcache# device won't be created until the cache shows up - particularly | 91 | /dev/bcache<N> device won't be created until the cache shows up - particularly |
88 | important if you have writeback caching turned on. | 92 | important if you have writeback caching turned on. |
89 | 93 | ||
90 | If you're booting up and your cache device is gone and never coming back, you | 94 | If you're booting up and your cache device is gone and never coming back, you |
@@ -191,6 +195,9 @@ want for getting the best possible numbers when benchmarking. | |||
191 | 195 | ||
192 | SYSFS - BACKING DEVICE: | 196 | SYSFS - BACKING DEVICE: |
193 | 197 | ||
198 | Available at /sys/block/<bdev>/bcache, /sys/block/bcache*/bcache and | ||
199 | (if attached) /sys/fs/bcache/<cset-uuid>/bdev* | ||
200 | |||
194 | attach | 201 | attach |
195 | Echo the UUID of a cache set to this file to enable caching. | 202 | Echo the UUID of a cache set to this file to enable caching. |
196 | 203 | ||
@@ -300,6 +307,8 @@ cache_readaheads | |||
300 | 307 | ||
301 | SYSFS - CACHE SET: | 308 | SYSFS - CACHE SET: |
302 | 309 | ||
310 | Available at /sys/fs/bcache/<cset-uuid> | ||
311 | |||
303 | average_key_size | 312 | average_key_size |
304 | Average data per key in the btree. | 313 | Average data per key in the btree. |
305 | 314 | ||
@@ -390,6 +399,8 @@ trigger_gc | |||
390 | 399 | ||
391 | SYSFS - CACHE DEVICE: | 400 | SYSFS - CACHE DEVICE: |
392 | 401 | ||
402 | Available at /sys/block/<cdev>/bcache | ||
403 | |||
393 | block_size | 404 | block_size |
394 | Minimum granularity of writes - should match hardware sector size. | 405 | Minimum granularity of writes - should match hardware sector size. |
395 | 406 | ||
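A minimal C sketch of the register-and-attach flow described in the documentation above (device names and the cache-set UUID are placeholders; error handling is trimmed):

    #include <stdio.h>

    /* Write a string to a sysfs file; returns 0 on success, -1 otherwise. */
    static int sysfs_write(const char *path, const char *val)
    {
            FILE *f = fopen(path, "w");

            if (!f)
                    return -1;
            fputs(val, f);
            return fclose(f) ? -1 : 0;
    }

    int main(void)
    {
            /* Manually register backing and cache devices (placeholder names). */
            sysfs_write("/sys/fs/bcache/register", "/dev/sdb");
            sysfs_write("/sys/fs/bcache/register", "/dev/sdc");

            /* Attach the backing device to its cache set (placeholder UUID). */
            sysfs_write("/sys/block/bcache0/bcache/attach",
                        "00000000-0000-0000-0000-000000000000");
            return 0;
    }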
diff --git a/MAINTAINERS b/MAINTAINERS
index bf61e04291ab..5d3facfd7899 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1642,7 +1642,7 @@ S: Maintained | |||
1642 | F: drivers/net/hamradio/baycom* | 1642 | F: drivers/net/hamradio/baycom* |
1643 | 1643 | ||
1644 | BCACHE (BLOCK LAYER CACHE) | 1644 | BCACHE (BLOCK LAYER CACHE) |
1645 | M: Kent Overstreet <koverstreet@google.com> | 1645 | M: Kent Overstreet <kmo@daterainc.com> |
1646 | L: linux-bcache@vger.kernel.org | 1646 | L: linux-bcache@vger.kernel.org |
1647 | W: http://bcache.evilpiepirate.org | 1647 | W: http://bcache.evilpiepirate.org |
1648 | S: Maintained: | 1648 | S: Maintained: |
@@ -3346,7 +3346,7 @@ F: Documentation/firmware_class/ | |||
3346 | F: drivers/base/firmware*.c | 3346 | F: drivers/base/firmware*.c |
3347 | F: include/linux/firmware.h | 3347 | F: include/linux/firmware.h |
3348 | 3348 | ||
3349 | FLASHSYSTEM DRIVER (IBM FlashSystem 70/80 PCI SSD Flash Card) | 3349 | FLASH ADAPTER DRIVER (IBM Flash Adapter 900GB Full Height PCI Flash Card) |
3350 | M: Joshua Morris <josh.h.morris@us.ibm.com> | 3350 | M: Joshua Morris <josh.h.morris@us.ibm.com> |
3351 | M: Philip Kelleher <pjk1939@linux.vnet.ibm.com> | 3351 | M: Philip Kelleher <pjk1939@linux.vnet.ibm.com> |
3352 | S: Maintained | 3352 | S: Maintained |
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index b81ddfea1da0..e07a5fd58ad7 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -532,11 +532,11 @@ config BLK_DEV_RBD | |||
532 | If unsure, say N. | 532 | If unsure, say N. |
533 | 533 | ||
534 | config BLK_DEV_RSXX | 534 | config BLK_DEV_RSXX |
535 | tristate "IBM FlashSystem 70/80 PCIe SSD Device Driver" | 535 | tristate "IBM Flash Adapter 900GB Full Height PCIe Device Driver" |
536 | depends on PCI | 536 | depends on PCI |
537 | help | 537 | help |
538 | Device driver for IBM's high speed PCIe SSD | 538 | Device driver for IBM's high speed PCIe SSD |
539 | storage devices: FlashSystem-70 and FlashSystem-80. | 539 | storage device: Flash Adapter 900GB Full Height. |
540 | 540 | ||
541 | To compile this driver as a module, choose M here: the | 541 | To compile this driver as a module, choose M here: the |
542 | module will be called rsxx. | 542 | module will be called rsxx. |
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 6608076dc39e..28c73ca320a8 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -659,6 +659,27 @@ void drbd_al_shrink(struct drbd_conf *mdev) | |||
659 | wake_up(&mdev->al_wait); | 659 | wake_up(&mdev->al_wait); |
660 | } | 660 | } |
661 | 661 | ||
662 | int drbd_initialize_al(struct drbd_conf *mdev, void *buffer) | ||
663 | { | ||
664 | struct al_transaction_on_disk *al = buffer; | ||
665 | struct drbd_md *md = &mdev->ldev->md; | ||
666 | sector_t al_base = md->md_offset + md->al_offset; | ||
667 | int al_size_4k = md->al_stripes * md->al_stripe_size_4k; | ||
668 | int i; | ||
669 | |||
670 | memset(al, 0, 4096); | ||
671 | al->magic = cpu_to_be32(DRBD_AL_MAGIC); | ||
672 | al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED); | ||
673 | al->crc32c = cpu_to_be32(crc32c(0, al, 4096)); | ||
674 | |||
675 | for (i = 0; i < al_size_4k; i++) { | ||
676 | int err = drbd_md_sync_page_io(mdev, mdev->ldev, al_base + i * 8, WRITE); | ||
677 | if (err) | ||
678 | return err; | ||
679 | } | ||
680 | return 0; | ||
681 | } | ||
682 | |||
662 | static int w_update_odbm(struct drbd_work *w, int unused) | 683 | static int w_update_odbm(struct drbd_work *w, int unused) |
663 | { | 684 | { |
664 | struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); | 685 | struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); |
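A worked example of the sector arithmetic in drbd_initialize_al() above, assuming 512-byte sectors so that one 4096-byte AL transaction block spans 8 sectors (the layout values below are assumed examples):

    /* Standalone illustration of the stride used above: block i of the
     * activity log starts at al_base + i * 8 sectors. */
    #include <stdio.h>

    int main(void)
    {
            unsigned long long al_base = 2048;          /* example offset, in sectors */
            int al_stripes = 1, al_stripe_size_4k = 8;  /* assumed layout */
            int al_size_4k = al_stripes * al_stripe_size_4k;
            int i;

            for (i = 0; i < al_size_4k; i++)
                    printf("AL block %d -> sector %llu\n", i, al_base + i * 8ULL);
            return 0;
    }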
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index f943aacfdad8..2d7f608d181c 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -832,6 +832,7 @@ struct drbd_tconn { /* is a resource from the config file */ | |||
832 | unsigned susp_nod:1; /* IO suspended because no data */ | 832 | unsigned susp_nod:1; /* IO suspended because no data */ |
833 | unsigned susp_fen:1; /* IO suspended because fence peer handler runs */ | 833 | unsigned susp_fen:1; /* IO suspended because fence peer handler runs */ |
834 | struct mutex cstate_mutex; /* Protects graceful disconnects */ | 834 | struct mutex cstate_mutex; /* Protects graceful disconnects */ |
835 | unsigned int connect_cnt; /* Inc each time a connection is established */ | ||
835 | 836 | ||
836 | unsigned long flags; | 837 | unsigned long flags; |
837 | struct net_conf *net_conf; /* content protected by rcu */ | 838 | struct net_conf *net_conf; /* content protected by rcu */ |
@@ -1132,6 +1133,7 @@ extern void drbd_mdev_cleanup(struct drbd_conf *mdev); | |||
1132 | void drbd_print_uuids(struct drbd_conf *mdev, const char *text); | 1133 | void drbd_print_uuids(struct drbd_conf *mdev, const char *text); |
1133 | 1134 | ||
1134 | extern void conn_md_sync(struct drbd_tconn *tconn); | 1135 | extern void conn_md_sync(struct drbd_tconn *tconn); |
1136 | extern void drbd_md_write(struct drbd_conf *mdev, void *buffer); | ||
1135 | extern void drbd_md_sync(struct drbd_conf *mdev); | 1137 | extern void drbd_md_sync(struct drbd_conf *mdev); |
1136 | extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev); | 1138 | extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev); |
1137 | extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); | 1139 | extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); |
@@ -1466,8 +1468,16 @@ extern void drbd_suspend_io(struct drbd_conf *mdev); | |||
1466 | extern void drbd_resume_io(struct drbd_conf *mdev); | 1468 | extern void drbd_resume_io(struct drbd_conf *mdev); |
1467 | extern char *ppsize(char *buf, unsigned long long size); | 1469 | extern char *ppsize(char *buf, unsigned long long size); |
1468 | extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, sector_t, int); | 1470 | extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, sector_t, int); |
1469 | enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; | 1471 | enum determine_dev_size { |
1470 | extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); | 1472 | DS_ERROR_SHRINK = -3, |
1473 | DS_ERROR_SPACE_MD = -2, | ||
1474 | DS_ERROR = -1, | ||
1475 | DS_UNCHANGED = 0, | ||
1476 | DS_SHRUNK = 1, | ||
1477 | DS_GREW = 2 | ||
1478 | }; | ||
1479 | extern enum determine_dev_size | ||
1480 | drbd_determine_dev_size(struct drbd_conf *, enum dds_flags, struct resize_parms *) __must_hold(local); | ||
1471 | extern void resync_after_online_grow(struct drbd_conf *); | 1481 | extern void resync_after_online_grow(struct drbd_conf *); |
1472 | extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev); | 1482 | extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev); |
1473 | extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev, | 1483 | extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev, |
@@ -1633,6 +1643,7 @@ extern int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, | |||
1633 | #define drbd_set_out_of_sync(mdev, sector, size) \ | 1643 | #define drbd_set_out_of_sync(mdev, sector, size) \ |
1634 | __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) | 1644 | __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) |
1635 | extern void drbd_al_shrink(struct drbd_conf *mdev); | 1645 | extern void drbd_al_shrink(struct drbd_conf *mdev); |
1646 | extern int drbd_initialize_al(struct drbd_conf *, void *); | ||
1636 | 1647 | ||
1637 | /* drbd_nl.c */ | 1648 | /* drbd_nl.c */ |
1638 | /* state info broadcast */ | 1649 | /* state info broadcast */ |
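With the new enum above, every failure value is negative and ordered below DS_ERROR, so a caller can catch all errors with a single "dd <= DS_ERROR" test and still switch on the specific code. A hedged sketch of such a caller (the errno mapping is an assumption for illustration, not taken from the patch; it relies on the enum defined above):

    static int resize_errno(enum determine_dev_size dd)
    {
            if (dd <= DS_ERROR) {
                    switch (dd) {
                    case DS_ERROR_SHRINK:   return -EINVAL; /* implicit shrink refused */
                    case DS_ERROR_SPACE_MD: return -ENOSPC; /* meta data does not fit */
                    default:                return -EIO;    /* generic failure */
                    }
            }
            return 0;       /* DS_UNCHANGED, DS_SHRUNK or DS_GREW */
    }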
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index a5dca6affcbb..55635edf563b 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2762,8 +2762,6 @@ int __init drbd_init(void) | |||
2762 | /* | 2762 | /* |
2763 | * allocate all necessary structs | 2763 | * allocate all necessary structs |
2764 | */ | 2764 | */ |
2765 | err = -ENOMEM; | ||
2766 | |||
2767 | init_waitqueue_head(&drbd_pp_wait); | 2765 | init_waitqueue_head(&drbd_pp_wait); |
2768 | 2766 | ||
2769 | drbd_proc = NULL; /* play safe for drbd_cleanup */ | 2767 | drbd_proc = NULL; /* play safe for drbd_cleanup */ |
@@ -2773,6 +2771,7 @@ int __init drbd_init(void) | |||
2773 | if (err) | 2771 | if (err) |
2774 | goto fail; | 2772 | goto fail; |
2775 | 2773 | ||
2774 | err = -ENOMEM; | ||
2776 | drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); | 2775 | drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); |
2777 | if (!drbd_proc) { | 2776 | if (!drbd_proc) { |
2778 | printk(KERN_ERR "drbd: unable to register proc file\n"); | 2777 | printk(KERN_ERR "drbd: unable to register proc file\n"); |
@@ -2803,7 +2802,6 @@ int __init drbd_init(void) | |||
2803 | fail: | 2802 | fail: |
2804 | drbd_cleanup(); | 2803 | drbd_cleanup(); |
2805 | if (err == -ENOMEM) | 2804 | if (err == -ENOMEM) |
2806 | /* currently always the case */ | ||
2807 | printk(KERN_ERR "drbd: ran out of memory\n"); | 2805 | printk(KERN_ERR "drbd: ran out of memory\n"); |
2808 | else | 2806 | else |
2809 | printk(KERN_ERR "drbd: initialization failure\n"); | 2807 | printk(KERN_ERR "drbd: initialization failure\n"); |
@@ -2881,34 +2879,14 @@ struct meta_data_on_disk { | |||
2881 | u8 reserved_u8[4096 - (7*8 + 10*4)]; | 2879 | u8 reserved_u8[4096 - (7*8 + 10*4)]; |
2882 | } __packed; | 2880 | } __packed; |
2883 | 2881 | ||
2884 | /** | 2882 | |
2885 | * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set | 2883 | |
2886 | * @mdev: DRBD device. | 2884 | void drbd_md_write(struct drbd_conf *mdev, void *b) |
2887 | */ | ||
2888 | void drbd_md_sync(struct drbd_conf *mdev) | ||
2889 | { | 2885 | { |
2890 | struct meta_data_on_disk *buffer; | 2886 | struct meta_data_on_disk *buffer = b; |
2891 | sector_t sector; | 2887 | sector_t sector; |
2892 | int i; | 2888 | int i; |
2893 | 2889 | ||
2894 | /* Don't accidentally change the DRBD meta data layout. */ | ||
2895 | BUILD_BUG_ON(UI_SIZE != 4); | ||
2896 | BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096); | ||
2897 | |||
2898 | del_timer(&mdev->md_sync_timer); | ||
2899 | /* timer may be rearmed by drbd_md_mark_dirty() now. */ | ||
2900 | if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) | ||
2901 | return; | ||
2902 | |||
2903 | /* We use here D_FAILED and not D_ATTACHING because we try to write | ||
2904 | * metadata even if we detach due to a disk failure! */ | ||
2905 | if (!get_ldev_if_state(mdev, D_FAILED)) | ||
2906 | return; | ||
2907 | |||
2908 | buffer = drbd_md_get_buffer(mdev); | ||
2909 | if (!buffer) | ||
2910 | goto out; | ||
2911 | |||
2912 | memset(buffer, 0, sizeof(*buffer)); | 2890 | memset(buffer, 0, sizeof(*buffer)); |
2913 | 2891 | ||
2914 | buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); | 2892 | buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); |
@@ -2937,6 +2915,35 @@ void drbd_md_sync(struct drbd_conf *mdev) | |||
2937 | dev_err(DEV, "meta data update failed!\n"); | 2915 | dev_err(DEV, "meta data update failed!\n"); |
2938 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); | 2916 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); |
2939 | } | 2917 | } |
2918 | } | ||
2919 | |||
2920 | /** | ||
2921 | * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set | ||
2922 | * @mdev: DRBD device. | ||
2923 | */ | ||
2924 | void drbd_md_sync(struct drbd_conf *mdev) | ||
2925 | { | ||
2926 | struct meta_data_on_disk *buffer; | ||
2927 | |||
2928 | /* Don't accidentally change the DRBD meta data layout. */ | ||
2929 | BUILD_BUG_ON(UI_SIZE != 4); | ||
2930 | BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096); | ||
2931 | |||
2932 | del_timer(&mdev->md_sync_timer); | ||
2933 | /* timer may be rearmed by drbd_md_mark_dirty() now. */ | ||
2934 | if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) | ||
2935 | return; | ||
2936 | |||
2937 | /* We use here D_FAILED and not D_ATTACHING because we try to write | ||
2938 | * metadata even if we detach due to a disk failure! */ | ||
2939 | if (!get_ldev_if_state(mdev, D_FAILED)) | ||
2940 | return; | ||
2941 | |||
2942 | buffer = drbd_md_get_buffer(mdev); | ||
2943 | if (!buffer) | ||
2944 | goto out; | ||
2945 | |||
2946 | drbd_md_write(mdev, buffer); | ||
2940 | 2947 | ||
2941 | /* Update mdev->ldev->md.la_size_sect, | 2948 | /* Update mdev->ldev->md.la_size_sect, |
2942 | * since we updated it on metadata. */ | 2949 | * since we updated it on metadata. */ |
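The refactor above splits the superblock write out of drbd_md_sync(), so a caller that already holds the meta-data IO buffer can write it directly instead of going through the MD_DIRTY path. Schematically (a sketch mirroring the caller added later in this series, not new code from this hunk):

    buffer = drbd_md_get_buffer(mdev);      /* also serializes meta-data IO */
    if (buffer) {
            drbd_md_write(mdev, buffer);    /* helper exported by this patch */
            drbd_md_put_buffer(mdev);
    }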
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 9e3f441e7e84..8cc1e640f485 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -417,6 +417,7 @@ static enum drbd_fencing_p highest_fencing_policy(struct drbd_tconn *tconn) | |||
417 | 417 | ||
418 | bool conn_try_outdate_peer(struct drbd_tconn *tconn) | 418 | bool conn_try_outdate_peer(struct drbd_tconn *tconn) |
419 | { | 419 | { |
420 | unsigned int connect_cnt; | ||
420 | union drbd_state mask = { }; | 421 | union drbd_state mask = { }; |
421 | union drbd_state val = { }; | 422 | union drbd_state val = { }; |
422 | enum drbd_fencing_p fp; | 423 | enum drbd_fencing_p fp; |
@@ -428,6 +429,10 @@ bool conn_try_outdate_peer(struct drbd_tconn *tconn) | |||
428 | return false; | 429 | return false; |
429 | } | 430 | } |
430 | 431 | ||
432 | spin_lock_irq(&tconn->req_lock); | ||
433 | connect_cnt = tconn->connect_cnt; | ||
434 | spin_unlock_irq(&tconn->req_lock); | ||
435 | |||
431 | fp = highest_fencing_policy(tconn); | 436 | fp = highest_fencing_policy(tconn); |
432 | switch (fp) { | 437 | switch (fp) { |
433 | case FP_NOT_AVAIL: | 438 | case FP_NOT_AVAIL: |
@@ -492,8 +497,14 @@ bool conn_try_outdate_peer(struct drbd_tconn *tconn) | |||
492 | here, because we might were able to re-establish the connection in the | 497 | here, because we might were able to re-establish the connection in the |
493 | meantime. */ | 498 | meantime. */ |
494 | spin_lock_irq(&tconn->req_lock); | 499 | spin_lock_irq(&tconn->req_lock); |
495 | if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags)) | 500 | if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags)) { |
496 | _conn_request_state(tconn, mask, val, CS_VERBOSE); | 501 | if (tconn->connect_cnt != connect_cnt) |
502 | /* In case the connection was established and droped | ||
503 | while the fence-peer handler was running, ignore it */ | ||
504 | conn_info(tconn, "Ignoring fence-peer exit code\n"); | ||
505 | else | ||
506 | _conn_request_state(tconn, mask, val, CS_VERBOSE); | ||
507 | } | ||
497 | spin_unlock_irq(&tconn->req_lock); | 508 | spin_unlock_irq(&tconn->req_lock); |
498 | 509 | ||
499 | return conn_highest_pdsk(tconn) <= D_OUTDATED; | 510 | return conn_highest_pdsk(tconn) <= D_OUTDATED; |
@@ -816,15 +827,20 @@ void drbd_resume_io(struct drbd_conf *mdev) | |||
816 | * Returns 0 on success, negative return values indicate errors. | 827 | * Returns 0 on success, negative return values indicate errors. |
817 | * You should call drbd_md_sync() after calling this function. | 828 | * You should call drbd_md_sync() after calling this function. |
818 | */ | 829 | */ |
819 | enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) | 830 | enum determine_dev_size |
831 | drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags, struct resize_parms *rs) __must_hold(local) | ||
820 | { | 832 | { |
821 | sector_t prev_first_sect, prev_size; /* previous meta location */ | 833 | sector_t prev_first_sect, prev_size; /* previous meta location */ |
822 | sector_t la_size_sect, u_size; | 834 | sector_t la_size_sect, u_size; |
835 | struct drbd_md *md = &mdev->ldev->md; | ||
836 | u32 prev_al_stripe_size_4k; | ||
837 | u32 prev_al_stripes; | ||
823 | sector_t size; | 838 | sector_t size; |
824 | char ppb[10]; | 839 | char ppb[10]; |
840 | void *buffer; | ||
825 | 841 | ||
826 | int md_moved, la_size_changed; | 842 | int md_moved, la_size_changed; |
827 | enum determine_dev_size rv = unchanged; | 843 | enum determine_dev_size rv = DS_UNCHANGED; |
828 | 844 | ||
829 | /* race: | 845 | /* race: |
830 | * application request passes inc_ap_bio, | 846 | * application request passes inc_ap_bio, |
@@ -836,6 +852,11 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds | |||
836 | * still lock the act_log to not trigger ASSERTs there. | 852 | * still lock the act_log to not trigger ASSERTs there. |
837 | */ | 853 | */ |
838 | drbd_suspend_io(mdev); | 854 | drbd_suspend_io(mdev); |
855 | buffer = drbd_md_get_buffer(mdev); /* Lock meta-data IO */ | ||
856 | if (!buffer) { | ||
857 | drbd_resume_io(mdev); | ||
858 | return DS_ERROR; | ||
859 | } | ||
839 | 860 | ||
840 | /* no wait necessary anymore, actually we could assert that */ | 861 | /* no wait necessary anymore, actually we could assert that */ |
841 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | 862 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); |
@@ -844,7 +865,17 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds | |||
844 | prev_size = mdev->ldev->md.md_size_sect; | 865 | prev_size = mdev->ldev->md.md_size_sect; |
845 | la_size_sect = mdev->ldev->md.la_size_sect; | 866 | la_size_sect = mdev->ldev->md.la_size_sect; |
846 | 867 | ||
847 | /* TODO: should only be some assert here, not (re)init... */ | 868 | if (rs) { |
869 | /* rs is non NULL if we should change the AL layout only */ | ||
870 | |||
871 | prev_al_stripes = md->al_stripes; | ||
872 | prev_al_stripe_size_4k = md->al_stripe_size_4k; | ||
873 | |||
874 | md->al_stripes = rs->al_stripes; | ||
875 | md->al_stripe_size_4k = rs->al_stripe_size / 4; | ||
876 | md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4; | ||
877 | } | ||
878 | |||
848 | drbd_md_set_sector_offsets(mdev, mdev->ldev); | 879 | drbd_md_set_sector_offsets(mdev, mdev->ldev); |
849 | 880 | ||
850 | rcu_read_lock(); | 881 | rcu_read_lock(); |
@@ -852,6 +883,21 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds | |||
852 | rcu_read_unlock(); | 883 | rcu_read_unlock(); |
853 | size = drbd_new_dev_size(mdev, mdev->ldev, u_size, flags & DDSF_FORCED); | 884 | size = drbd_new_dev_size(mdev, mdev->ldev, u_size, flags & DDSF_FORCED); |
854 | 885 | ||
886 | if (size < la_size_sect) { | ||
887 | if (rs && u_size == 0) { | ||
888 | /* Remove "rs &&" later. This check should always be active, but | ||
889 | right now the receiver expects the permissive behavior */ | ||
890 | dev_warn(DEV, "Implicit shrink not allowed. " | ||
891 | "Use --size=%llus for explicit shrink.\n", | ||
892 | (unsigned long long)size); | ||
893 | rv = DS_ERROR_SHRINK; | ||
894 | } | ||
895 | if (u_size > size) | ||
896 | rv = DS_ERROR_SPACE_MD; | ||
897 | if (rv != DS_UNCHANGED) | ||
898 | goto err_out; | ||
899 | } | ||
900 | |||
855 | if (drbd_get_capacity(mdev->this_bdev) != size || | 901 | if (drbd_get_capacity(mdev->this_bdev) != size || |
856 | drbd_bm_capacity(mdev) != size) { | 902 | drbd_bm_capacity(mdev) != size) { |
857 | int err; | 903 | int err; |
@@ -867,7 +913,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds | |||
867 | "Leaving size unchanged at size = %lu KB\n", | 913 | "Leaving size unchanged at size = %lu KB\n", |
868 | (unsigned long)size); | 914 | (unsigned long)size); |
869 | } | 915 | } |
870 | rv = dev_size_error; | 916 | rv = DS_ERROR; |
871 | } | 917 | } |
872 | /* racy, see comments above. */ | 918 | /* racy, see comments above. */ |
873 | drbd_set_my_capacity(mdev, size); | 919 | drbd_set_my_capacity(mdev, size); |
@@ -875,38 +921,57 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds | |||
875 | dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), | 921 | dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), |
876 | (unsigned long long)size>>1); | 922 | (unsigned long long)size>>1); |
877 | } | 923 | } |
878 | if (rv == dev_size_error) | 924 | if (rv <= DS_ERROR) |
879 | goto out; | 925 | goto err_out; |
880 | 926 | ||
881 | la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect); | 927 | la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect); |
882 | 928 | ||
883 | md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev) | 929 | md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev) |
884 | || prev_size != mdev->ldev->md.md_size_sect; | 930 | || prev_size != mdev->ldev->md.md_size_sect; |
885 | 931 | ||
886 | if (la_size_changed || md_moved) { | 932 | if (la_size_changed || md_moved || rs) { |
887 | int err; | 933 | u32 prev_flags; |
888 | 934 | ||
889 | drbd_al_shrink(mdev); /* All extents inactive. */ | 935 | drbd_al_shrink(mdev); /* All extents inactive. */ |
936 | |||
937 | prev_flags = md->flags; | ||
938 | md->flags &= ~MDF_PRIMARY_IND; | ||
939 | drbd_md_write(mdev, buffer); | ||
940 | |||
890 | dev_info(DEV, "Writing the whole bitmap, %s\n", | 941 | dev_info(DEV, "Writing the whole bitmap, %s\n", |
891 | la_size_changed && md_moved ? "size changed and md moved" : | 942 | la_size_changed && md_moved ? "size changed and md moved" : |
892 | la_size_changed ? "size changed" : "md moved"); | 943 | la_size_changed ? "size changed" : "md moved"); |
893 | /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ | 944 | /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ |
894 | err = drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write, | 945 | drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write, |
895 | "size changed", BM_LOCKED_MASK); | 946 | "size changed", BM_LOCKED_MASK); |
896 | if (err) { | 947 | drbd_initialize_al(mdev, buffer); |
897 | rv = dev_size_error; | 948 | |
898 | goto out; | 949 | md->flags = prev_flags; |
899 | } | 950 | drbd_md_write(mdev, buffer); |
900 | drbd_md_mark_dirty(mdev); | 951 | |
952 | if (rs) | ||
953 | dev_info(DEV, "Changed AL layout to al-stripes = %d, al-stripe-size-kB = %d\n", | ||
954 | md->al_stripes, md->al_stripe_size_4k * 4); | ||
901 | } | 955 | } |
902 | 956 | ||
903 | if (size > la_size_sect) | 957 | if (size > la_size_sect) |
904 | rv = grew; | 958 | rv = DS_GREW; |
905 | if (size < la_size_sect) | 959 | if (size < la_size_sect) |
906 | rv = shrunk; | 960 | rv = DS_SHRUNK; |
907 | out: | 961 | |
962 | if (0) { | ||
963 | err_out: | ||
964 | if (rs) { | ||
965 | md->al_stripes = prev_al_stripes; | ||
966 | md->al_stripe_size_4k = prev_al_stripe_size_4k; | ||
967 | md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k; | ||
968 | |||
969 | drbd_md_set_sector_offsets(mdev, mdev->ldev); | ||
970 | } | ||
971 | } | ||
908 | lc_unlock(mdev->act_log); | 972 | lc_unlock(mdev->act_log); |
909 | wake_up(&mdev->al_wait); | 973 | wake_up(&mdev->al_wait); |
974 | drbd_md_put_buffer(mdev); | ||
910 | drbd_resume_io(mdev); | 975 | drbd_resume_io(mdev); |
911 | 976 | ||
912 | return rv; | 977 | return rv; |
@@ -1607,11 +1672,11 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) | |||
1607 | !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND)) | 1672 | !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND)) |
1608 | set_bit(USE_DEGR_WFC_T, &mdev->flags); | 1673 | set_bit(USE_DEGR_WFC_T, &mdev->flags); |
1609 | 1674 | ||
1610 | dd = drbd_determine_dev_size(mdev, 0); | 1675 | dd = drbd_determine_dev_size(mdev, 0, NULL); |
1611 | if (dd == dev_size_error) { | 1676 | if (dd <= DS_ERROR) { |
1612 | retcode = ERR_NOMEM_BITMAP; | 1677 | retcode = ERR_NOMEM_BITMAP; |
1613 | goto force_diskless_dec; | 1678 | goto force_diskless_dec; |
1614 | } else if (dd == grew) | 1679 | } else if (dd == DS_GREW) |
1615 | set_bit(RESYNC_AFTER_NEG, &mdev->flags); | 1680 | set_bit(RESYNC_AFTER_NEG, &mdev->flags); |
1616 | 1681 | ||
1617 | if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC) || | 1682 | if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC) || |
@@ -2305,6 +2370,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) | |||
2305 | struct drbd_conf *mdev; | 2370 | struct drbd_conf *mdev; |
2306 | enum drbd_ret_code retcode; | 2371 | enum drbd_ret_code retcode; |
2307 | enum determine_dev_size dd; | 2372 | enum determine_dev_size dd; |
2373 | bool change_al_layout = false; | ||
2308 | enum dds_flags ddsf; | 2374 | enum dds_flags ddsf; |
2309 | sector_t u_size; | 2375 | sector_t u_size; |
2310 | int err; | 2376 | int err; |
@@ -2315,31 +2381,33 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) | |||
2315 | if (retcode != NO_ERROR) | 2381 | if (retcode != NO_ERROR) |
2316 | goto fail; | 2382 | goto fail; |
2317 | 2383 | ||
2384 | mdev = adm_ctx.mdev; | ||
2385 | if (!get_ldev(mdev)) { | ||
2386 | retcode = ERR_NO_DISK; | ||
2387 | goto fail; | ||
2388 | } | ||
2389 | |||
2318 | memset(&rs, 0, sizeof(struct resize_parms)); | 2390 | memset(&rs, 0, sizeof(struct resize_parms)); |
2391 | rs.al_stripes = mdev->ldev->md.al_stripes; | ||
2392 | rs.al_stripe_size = mdev->ldev->md.al_stripe_size_4k * 4; | ||
2319 | if (info->attrs[DRBD_NLA_RESIZE_PARMS]) { | 2393 | if (info->attrs[DRBD_NLA_RESIZE_PARMS]) { |
2320 | err = resize_parms_from_attrs(&rs, info); | 2394 | err = resize_parms_from_attrs(&rs, info); |
2321 | if (err) { | 2395 | if (err) { |
2322 | retcode = ERR_MANDATORY_TAG; | 2396 | retcode = ERR_MANDATORY_TAG; |
2323 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | 2397 | drbd_msg_put_info(from_attrs_err_to_txt(err)); |
2324 | goto fail; | 2398 | goto fail_ldev; |
2325 | } | 2399 | } |
2326 | } | 2400 | } |
2327 | 2401 | ||
2328 | mdev = adm_ctx.mdev; | ||
2329 | if (mdev->state.conn > C_CONNECTED) { | 2402 | if (mdev->state.conn > C_CONNECTED) { |
2330 | retcode = ERR_RESIZE_RESYNC; | 2403 | retcode = ERR_RESIZE_RESYNC; |
2331 | goto fail; | 2404 | goto fail_ldev; |
2332 | } | 2405 | } |
2333 | 2406 | ||
2334 | if (mdev->state.role == R_SECONDARY && | 2407 | if (mdev->state.role == R_SECONDARY && |
2335 | mdev->state.peer == R_SECONDARY) { | 2408 | mdev->state.peer == R_SECONDARY) { |
2336 | retcode = ERR_NO_PRIMARY; | 2409 | retcode = ERR_NO_PRIMARY; |
2337 | goto fail; | 2410 | goto fail_ldev; |
2338 | } | ||
2339 | |||
2340 | if (!get_ldev(mdev)) { | ||
2341 | retcode = ERR_NO_DISK; | ||
2342 | goto fail; | ||
2343 | } | 2411 | } |
2344 | 2412 | ||
2345 | if (rs.no_resync && mdev->tconn->agreed_pro_version < 93) { | 2413 | if (rs.no_resync && mdev->tconn->agreed_pro_version < 93) { |
@@ -2358,6 +2426,28 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) | |||
2358 | } | 2426 | } |
2359 | } | 2427 | } |
2360 | 2428 | ||
2429 | if (mdev->ldev->md.al_stripes != rs.al_stripes || | ||
2430 | mdev->ldev->md.al_stripe_size_4k != rs.al_stripe_size / 4) { | ||
2431 | u32 al_size_k = rs.al_stripes * rs.al_stripe_size; | ||
2432 | |||
2433 | if (al_size_k > (16 * 1024 * 1024)) { | ||
2434 | retcode = ERR_MD_LAYOUT_TOO_BIG; | ||
2435 | goto fail_ldev; | ||
2436 | } | ||
2437 | |||
2438 | if (al_size_k < MD_32kB_SECT/2) { | ||
2439 | retcode = ERR_MD_LAYOUT_TOO_SMALL; | ||
2440 | goto fail_ldev; | ||
2441 | } | ||
2442 | |||
2443 | if (mdev->state.conn != C_CONNECTED) { | ||
2444 | retcode = ERR_MD_LAYOUT_CONNECTED; | ||
2445 | goto fail_ldev; | ||
2446 | } | ||
2447 | |||
2448 | change_al_layout = true; | ||
2449 | } | ||
2450 | |||
2361 | if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) | 2451 | if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) |
2362 | mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); | 2452 | mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); |
2363 | 2453 | ||
@@ -2373,16 +2463,22 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) | |||
2373 | } | 2463 | } |
2374 | 2464 | ||
2375 | ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); | 2465 | ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); |
2376 | dd = drbd_determine_dev_size(mdev, ddsf); | 2466 | dd = drbd_determine_dev_size(mdev, ddsf, change_al_layout ? &rs : NULL); |
2377 | drbd_md_sync(mdev); | 2467 | drbd_md_sync(mdev); |
2378 | put_ldev(mdev); | 2468 | put_ldev(mdev); |
2379 | if (dd == dev_size_error) { | 2469 | if (dd == DS_ERROR) { |
2380 | retcode = ERR_NOMEM_BITMAP; | 2470 | retcode = ERR_NOMEM_BITMAP; |
2381 | goto fail; | 2471 | goto fail; |
2472 | } else if (dd == DS_ERROR_SPACE_MD) { | ||
2473 | retcode = ERR_MD_LAYOUT_NO_FIT; | ||
2474 | goto fail; | ||
2475 | } else if (dd == DS_ERROR_SHRINK) { | ||
2476 | retcode = ERR_IMPLICIT_SHRINK; | ||
2477 | goto fail; | ||
2382 | } | 2478 | } |
2383 | 2479 | ||
2384 | if (mdev->state.conn == C_CONNECTED) { | 2480 | if (mdev->state.conn == C_CONNECTED) { |
2385 | if (dd == grew) | 2481 | if (dd == DS_GREW) |
2386 | set_bit(RESIZE_PENDING, &mdev->flags); | 2482 | set_bit(RESIZE_PENDING, &mdev->flags); |
2387 | 2483 | ||
2388 | drbd_send_uuids(mdev); | 2484 | drbd_send_uuids(mdev); |
@@ -2658,7 +2754,6 @@ int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev, | |||
2658 | const struct sib_info *sib) | 2754 | const struct sib_info *sib) |
2659 | { | 2755 | { |
2660 | struct state_info *si = NULL; /* for sizeof(si->member); */ | 2756 | struct state_info *si = NULL; /* for sizeof(si->member); */ |
2661 | struct net_conf *nc; | ||
2662 | struct nlattr *nla; | 2757 | struct nlattr *nla; |
2663 | int got_ldev; | 2758 | int got_ldev; |
2664 | int err = 0; | 2759 | int err = 0; |
@@ -2688,13 +2783,19 @@ int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev, | |||
2688 | goto nla_put_failure; | 2783 | goto nla_put_failure; |
2689 | 2784 | ||
2690 | rcu_read_lock(); | 2785 | rcu_read_lock(); |
2691 | if (got_ldev) | 2786 | if (got_ldev) { |
2692 | if (disk_conf_to_skb(skb, rcu_dereference(mdev->ldev->disk_conf), exclude_sensitive)) | 2787 | struct disk_conf *disk_conf; |
2693 | goto nla_put_failure; | ||
2694 | 2788 | ||
2695 | nc = rcu_dereference(mdev->tconn->net_conf); | 2789 | disk_conf = rcu_dereference(mdev->ldev->disk_conf); |
2696 | if (nc) | 2790 | err = disk_conf_to_skb(skb, disk_conf, exclude_sensitive); |
2697 | err = net_conf_to_skb(skb, nc, exclude_sensitive); | 2791 | } |
2792 | if (!err) { | ||
2793 | struct net_conf *nc; | ||
2794 | |||
2795 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
2796 | if (nc) | ||
2797 | err = net_conf_to_skb(skb, nc, exclude_sensitive); | ||
2798 | } | ||
2698 | rcu_read_unlock(); | 2799 | rcu_read_unlock(); |
2699 | if (err) | 2800 | if (err) |
2700 | goto nla_put_failure; | 2801 | goto nla_put_failure; |
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 4222affff488..cc29cd3bf78b 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1039,6 +1039,8 @@ randomize: | |||
1039 | rcu_read_lock(); | 1039 | rcu_read_lock(); |
1040 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | 1040 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { |
1041 | kref_get(&mdev->kref); | 1041 | kref_get(&mdev->kref); |
1042 | rcu_read_unlock(); | ||
1043 | |||
1042 | /* Prevent a race between resync-handshake and | 1044 | /* Prevent a race between resync-handshake and |
1043 | * being promoted to Primary. | 1045 | * being promoted to Primary. |
1044 | * | 1046 | * |
@@ -1049,8 +1051,6 @@ randomize: | |||
1049 | mutex_lock(mdev->state_mutex); | 1051 | mutex_lock(mdev->state_mutex); |
1050 | mutex_unlock(mdev->state_mutex); | 1052 | mutex_unlock(mdev->state_mutex); |
1051 | 1053 | ||
1052 | rcu_read_unlock(); | ||
1053 | |||
1054 | if (discard_my_data) | 1054 | if (discard_my_data) |
1055 | set_bit(DISCARD_MY_DATA, &mdev->flags); | 1055 | set_bit(DISCARD_MY_DATA, &mdev->flags); |
1056 | else | 1056 | else |
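The reordering in the two hunks above is the usual shape of a "do not sleep inside rcu" fix: pin the object with a kref under rcu_read_lock(), drop the RCU read lock before the sleeping mutex_lock(), then re-take it for the next iteration. Schematically (not a verbatim copy; the kref_put and its destructor are assumed from the surrounding code, which is not shown in these hunks):

    rcu_read_lock();
    idr_for_each_entry(&tconn->volumes, mdev, vnr) {
            kref_get(&mdev->kref);          /* pin mdev across the sleep */
            rcu_read_unlock();              /* mutex_lock() below may sleep */

            mutex_lock(mdev->state_mutex);
            mutex_unlock(mdev->state_mutex);

            kref_put(&mdev->kref, &drbd_minor_destroy);     /* assumed destructor */
            rcu_read_lock();                /* back under RCU for the next entry */
    }
    rcu_read_unlock();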
@@ -3545,7 +3545,7 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi) | |||
3545 | { | 3545 | { |
3546 | struct drbd_conf *mdev; | 3546 | struct drbd_conf *mdev; |
3547 | struct p_sizes *p = pi->data; | 3547 | struct p_sizes *p = pi->data; |
3548 | enum determine_dev_size dd = unchanged; | 3548 | enum determine_dev_size dd = DS_UNCHANGED; |
3549 | sector_t p_size, p_usize, my_usize; | 3549 | sector_t p_size, p_usize, my_usize; |
3550 | int ldsc = 0; /* local disk size changed */ | 3550 | int ldsc = 0; /* local disk size changed */ |
3551 | enum dds_flags ddsf; | 3551 | enum dds_flags ddsf; |
@@ -3617,9 +3617,9 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi) | |||
3617 | 3617 | ||
3618 | ddsf = be16_to_cpu(p->dds_flags); | 3618 | ddsf = be16_to_cpu(p->dds_flags); |
3619 | if (get_ldev(mdev)) { | 3619 | if (get_ldev(mdev)) { |
3620 | dd = drbd_determine_dev_size(mdev, ddsf); | 3620 | dd = drbd_determine_dev_size(mdev, ddsf, NULL); |
3621 | put_ldev(mdev); | 3621 | put_ldev(mdev); |
3622 | if (dd == dev_size_error) | 3622 | if (dd == DS_ERROR) |
3623 | return -EIO; | 3623 | return -EIO; |
3624 | drbd_md_sync(mdev); | 3624 | drbd_md_sync(mdev); |
3625 | } else { | 3625 | } else { |
@@ -3647,7 +3647,7 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi) | |||
3647 | drbd_send_sizes(mdev, 0, ddsf); | 3647 | drbd_send_sizes(mdev, 0, ddsf); |
3648 | } | 3648 | } |
3649 | if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) || | 3649 | if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) || |
3650 | (dd == grew && mdev->state.conn == C_CONNECTED)) { | 3650 | (dd == DS_GREW && mdev->state.conn == C_CONNECTED)) { |
3651 | if (mdev->state.pdsk >= D_INCONSISTENT && | 3651 | if (mdev->state.pdsk >= D_INCONSISTENT && |
3652 | mdev->state.disk >= D_INCONSISTENT) { | 3652 | mdev->state.disk >= D_INCONSISTENT) { |
3653 | if (ddsf & DDSF_NO_RESYNC) | 3653 | if (ddsf & DDSF_NO_RESYNC) |
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 90c5be2b1d30..216d47b7e88b 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -1115,8 +1115,10 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, | |||
1115 | drbd_thread_restart_nowait(&mdev->tconn->receiver); | 1115 | drbd_thread_restart_nowait(&mdev->tconn->receiver); |
1116 | 1116 | ||
1117 | /* Resume AL writing if we get a connection */ | 1117 | /* Resume AL writing if we get a connection */ |
1118 | if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) | 1118 | if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { |
1119 | drbd_resume_al(mdev); | 1119 | drbd_resume_al(mdev); |
1120 | mdev->tconn->connect_cnt++; | ||
1121 | } | ||
1120 | 1122 | ||
1121 | /* remember last attach time so request_timer_fn() won't | 1123 | /* remember last attach time so request_timer_fn() won't |
1122 | * kill newly established sessions while we are still trying to thaw | 1124 | * kill newly established sessions while we are still trying to thaw |
diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c
index 5af21f2db29c..6e85e21445eb 100644
--- a/drivers/block/rsxx/core.c
+++ b/drivers/block/rsxx/core.c
@@ -31,6 +31,8 @@ | |||
31 | #include <linux/slab.h> | 31 | #include <linux/slab.h> |
32 | #include <linux/bitops.h> | 32 | #include <linux/bitops.h> |
33 | #include <linux/delay.h> | 33 | #include <linux/delay.h> |
34 | #include <linux/debugfs.h> | ||
35 | #include <linux/seq_file.h> | ||
34 | 36 | ||
35 | #include <linux/genhd.h> | 37 | #include <linux/genhd.h> |
36 | #include <linux/idr.h> | 38 | #include <linux/idr.h> |
@@ -39,8 +41,9 @@ | |||
39 | #include "rsxx_cfg.h" | 41 | #include "rsxx_cfg.h" |
40 | 42 | ||
41 | #define NO_LEGACY 0 | 43 | #define NO_LEGACY 0 |
44 | #define SYNC_START_TIMEOUT (10 * 60) /* 10 minutes */ | ||
42 | 45 | ||
43 | MODULE_DESCRIPTION("IBM FlashSystem 70/80 PCIe SSD Device Driver"); | 46 | MODULE_DESCRIPTION("IBM Flash Adapter 900GB Full Height Device Driver"); |
44 | MODULE_AUTHOR("Joshua Morris/Philip Kelleher, IBM"); | 47 | MODULE_AUTHOR("Joshua Morris/Philip Kelleher, IBM"); |
45 | MODULE_LICENSE("GPL"); | 48 | MODULE_LICENSE("GPL"); |
46 | MODULE_VERSION(DRIVER_VERSION); | 49 | MODULE_VERSION(DRIVER_VERSION); |
@@ -49,9 +52,282 @@ static unsigned int force_legacy = NO_LEGACY; | |||
49 | module_param(force_legacy, uint, 0444); | 52 | module_param(force_legacy, uint, 0444); |
50 | MODULE_PARM_DESC(force_legacy, "Force the use of legacy type PCI interrupts"); | 53 | MODULE_PARM_DESC(force_legacy, "Force the use of legacy type PCI interrupts"); |
51 | 54 | ||
55 | static unsigned int sync_start = 1; | ||
56 | module_param(sync_start, uint, 0444); | ||
57 | MODULE_PARM_DESC(sync_start, "On by Default: Driver load will not complete " | ||
58 | "until the card startup has completed."); | ||
59 | |||
52 | static DEFINE_IDA(rsxx_disk_ida); | 60 | static DEFINE_IDA(rsxx_disk_ida); |
53 | static DEFINE_SPINLOCK(rsxx_ida_lock); | 61 | static DEFINE_SPINLOCK(rsxx_ida_lock); |
54 | 62 | ||
63 | /* --------------------Debugfs Setup ------------------- */ | ||
64 | |||
65 | struct rsxx_cram { | ||
66 | u32 f_pos; | ||
67 | u32 offset; | ||
68 | void *i_private; | ||
69 | }; | ||
70 | |||
71 | static int rsxx_attr_pci_regs_show(struct seq_file *m, void *p) | ||
72 | { | ||
73 | struct rsxx_cardinfo *card = m->private; | ||
74 | |||
75 | seq_printf(m, "HWID 0x%08x\n", | ||
76 | ioread32(card->regmap + HWID)); | ||
77 | seq_printf(m, "SCRATCH 0x%08x\n", | ||
78 | ioread32(card->regmap + SCRATCH)); | ||
79 | seq_printf(m, "IER 0x%08x\n", | ||
80 | ioread32(card->regmap + IER)); | ||
81 | seq_printf(m, "IPR 0x%08x\n", | ||
82 | ioread32(card->regmap + IPR)); | ||
83 | seq_printf(m, "CREG_CMD 0x%08x\n", | ||
84 | ioread32(card->regmap + CREG_CMD)); | ||
85 | seq_printf(m, "CREG_ADD 0x%08x\n", | ||
86 | ioread32(card->regmap + CREG_ADD)); | ||
87 | seq_printf(m, "CREG_CNT 0x%08x\n", | ||
88 | ioread32(card->regmap + CREG_CNT)); | ||
89 | seq_printf(m, "CREG_STAT 0x%08x\n", | ||
90 | ioread32(card->regmap + CREG_STAT)); | ||
91 | seq_printf(m, "CREG_DATA0 0x%08x\n", | ||
92 | ioread32(card->regmap + CREG_DATA0)); | ||
93 | seq_printf(m, "CREG_DATA1 0x%08x\n", | ||
94 | ioread32(card->regmap + CREG_DATA1)); | ||
95 | seq_printf(m, "CREG_DATA2 0x%08x\n", | ||
96 | ioread32(card->regmap + CREG_DATA2)); | ||
97 | seq_printf(m, "CREG_DATA3 0x%08x\n", | ||
98 | ioread32(card->regmap + CREG_DATA3)); | ||
99 | seq_printf(m, "CREG_DATA4 0x%08x\n", | ||
100 | ioread32(card->regmap + CREG_DATA4)); | ||
101 | seq_printf(m, "CREG_DATA5 0x%08x\n", | ||
102 | ioread32(card->regmap + CREG_DATA5)); | ||
103 | seq_printf(m, "CREG_DATA6 0x%08x\n", | ||
104 | ioread32(card->regmap + CREG_DATA6)); | ||
105 | seq_printf(m, "CREG_DATA7 0x%08x\n", | ||
106 | ioread32(card->regmap + CREG_DATA7)); | ||
107 | seq_printf(m, "INTR_COAL 0x%08x\n", | ||
108 | ioread32(card->regmap + INTR_COAL)); | ||
109 | seq_printf(m, "HW_ERROR 0x%08x\n", | ||
110 | ioread32(card->regmap + HW_ERROR)); | ||
111 | seq_printf(m, "DEBUG0 0x%08x\n", | ||
112 | ioread32(card->regmap + PCI_DEBUG0)); | ||
113 | seq_printf(m, "DEBUG1 0x%08x\n", | ||
114 | ioread32(card->regmap + PCI_DEBUG1)); | ||
115 | seq_printf(m, "DEBUG2 0x%08x\n", | ||
116 | ioread32(card->regmap + PCI_DEBUG2)); | ||
117 | seq_printf(m, "DEBUG3 0x%08x\n", | ||
118 | ioread32(card->regmap + PCI_DEBUG3)); | ||
119 | seq_printf(m, "DEBUG4 0x%08x\n", | ||
120 | ioread32(card->regmap + PCI_DEBUG4)); | ||
121 | seq_printf(m, "DEBUG5 0x%08x\n", | ||
122 | ioread32(card->regmap + PCI_DEBUG5)); | ||
123 | seq_printf(m, "DEBUG6 0x%08x\n", | ||
124 | ioread32(card->regmap + PCI_DEBUG6)); | ||
125 | seq_printf(m, "DEBUG7 0x%08x\n", | ||
126 | ioread32(card->regmap + PCI_DEBUG7)); | ||
127 | seq_printf(m, "RECONFIG 0x%08x\n", | ||
128 | ioread32(card->regmap + PCI_RECONFIG)); | ||
129 | |||
130 | return 0; | ||
131 | } | ||
132 | |||
133 | static int rsxx_attr_stats_show(struct seq_file *m, void *p) | ||
134 | { | ||
135 | struct rsxx_cardinfo *card = m->private; | ||
136 | int i; | ||
137 | |||
138 | for (i = 0; i < card->n_targets; i++) { | ||
139 | seq_printf(m, "Ctrl %d CRC Errors = %d\n", | ||
140 | i, card->ctrl[i].stats.crc_errors); | ||
141 | seq_printf(m, "Ctrl %d Hard Errors = %d\n", | ||
142 | i, card->ctrl[i].stats.hard_errors); | ||
143 | seq_printf(m, "Ctrl %d Soft Errors = %d\n", | ||
144 | i, card->ctrl[i].stats.soft_errors); | ||
145 | seq_printf(m, "Ctrl %d Writes Issued = %d\n", | ||
146 | i, card->ctrl[i].stats.writes_issued); | ||
147 | seq_printf(m, "Ctrl %d Writes Failed = %d\n", | ||
148 | i, card->ctrl[i].stats.writes_failed); | ||
149 | seq_printf(m, "Ctrl %d Reads Issued = %d\n", | ||
150 | i, card->ctrl[i].stats.reads_issued); | ||
151 | seq_printf(m, "Ctrl %d Reads Failed = %d\n", | ||
152 | i, card->ctrl[i].stats.reads_failed); | ||
153 | seq_printf(m, "Ctrl %d Reads Retried = %d\n", | ||
154 | i, card->ctrl[i].stats.reads_retried); | ||
155 | seq_printf(m, "Ctrl %d Discards Issued = %d\n", | ||
156 | i, card->ctrl[i].stats.discards_issued); | ||
157 | seq_printf(m, "Ctrl %d Discards Failed = %d\n", | ||
158 | i, card->ctrl[i].stats.discards_failed); | ||
159 | seq_printf(m, "Ctrl %d DMA SW Errors = %d\n", | ||
160 | i, card->ctrl[i].stats.dma_sw_err); | ||
161 | seq_printf(m, "Ctrl %d DMA HW Faults = %d\n", | ||
162 | i, card->ctrl[i].stats.dma_hw_fault); | ||
163 | seq_printf(m, "Ctrl %d DMAs Cancelled = %d\n", | ||
164 | i, card->ctrl[i].stats.dma_cancelled); | ||
165 | seq_printf(m, "Ctrl %d SW Queue Depth = %d\n", | ||
166 | i, card->ctrl[i].stats.sw_q_depth); | ||
167 | seq_printf(m, "Ctrl %d HW Queue Depth = %d\n", | ||
168 | i, atomic_read(&card->ctrl[i].stats.hw_q_depth)); | ||
169 | } | ||
170 | |||
171 | return 0; | ||
172 | } | ||
173 | |||
174 | static int rsxx_attr_stats_open(struct inode *inode, struct file *file) | ||
175 | { | ||
176 | return single_open(file, rsxx_attr_stats_show, inode->i_private); | ||
177 | } | ||
178 | |||
179 | static int rsxx_attr_pci_regs_open(struct inode *inode, struct file *file) | ||
180 | { | ||
181 | return single_open(file, rsxx_attr_pci_regs_show, inode->i_private); | ||
182 | } | ||
183 | |||
184 | static ssize_t rsxx_cram_read(struct file *fp, char __user *ubuf, | ||
185 | size_t cnt, loff_t *ppos) | ||
186 | { | ||
187 | struct rsxx_cram *info = fp->private_data; | ||
188 | struct rsxx_cardinfo *card = info->i_private; | ||
189 | char *buf; | ||
190 | int st; | ||
191 | |||
192 | buf = kzalloc(sizeof(*buf) * cnt, GFP_KERNEL); | ||
193 | if (!buf) | ||
194 | return -ENOMEM; | ||
195 | |||
196 | info->f_pos = (u32)*ppos + info->offset; | ||
197 | |||
198 | st = rsxx_creg_read(card, CREG_ADD_CRAM + info->f_pos, cnt, buf, 1); | ||
199 | if (st) | ||
200 | return st; | ||
201 | |||
202 | st = copy_to_user(ubuf, buf, cnt); | ||
203 | if (st) | ||
204 | return st; | ||
205 | |||
206 | info->offset += cnt; | ||
207 | |||
208 | kfree(buf); | ||
209 | |||
210 | return cnt; | ||
211 | } | ||
212 | |||
213 | static ssize_t rsxx_cram_write(struct file *fp, const char __user *ubuf, | ||
214 | size_t cnt, loff_t *ppos) | ||
215 | { | ||
216 | struct rsxx_cram *info = fp->private_data; | ||
217 | struct rsxx_cardinfo *card = info->i_private; | ||
218 | char *buf; | ||
219 | int st; | ||
220 | |||
221 | buf = kzalloc(sizeof(*buf) * cnt, GFP_KERNEL); | ||
222 | if (!buf) | ||
223 | return -ENOMEM; | ||
224 | |||
225 | st = copy_from_user(buf, ubuf, cnt); | ||
226 | if (st) | ||
227 | return st; | ||
228 | |||
229 | info->f_pos = (u32)*ppos + info->offset; | ||
230 | |||
231 | st = rsxx_creg_write(card, CREG_ADD_CRAM + info->f_pos, cnt, buf, 1); | ||
232 | if (st) | ||
233 | return st; | ||
234 | |||
235 | info->offset += cnt; | ||
236 | |||
237 | kfree(buf); | ||
238 | |||
239 | return cnt; | ||
240 | } | ||
241 | |||
242 | static int rsxx_cram_open(struct inode *inode, struct file *file) | ||
243 | { | ||
244 | struct rsxx_cram *info = kzalloc(sizeof(*info), GFP_KERNEL); | ||
245 | if (!info) | ||
246 | return -ENOMEM; | ||
247 | |||
248 | info->i_private = inode->i_private; | ||
249 | info->f_pos = file->f_pos; | ||
250 | file->private_data = info; | ||
251 | |||
252 | return 0; | ||
253 | } | ||
254 | |||
255 | static int rsxx_cram_release(struct inode *inode, struct file *file) | ||
256 | { | ||
257 | struct rsxx_cram *info = file->private_data; | ||
258 | |||
259 | if (!info) | ||
260 | return 0; | ||
261 | |||
262 | kfree(info); | ||
263 | file->private_data = NULL; | ||
264 | |||
265 | return 0; | ||
266 | } | ||
267 | |||
268 | static const struct file_operations debugfs_cram_fops = { | ||
269 | .owner = THIS_MODULE, | ||
270 | .open = rsxx_cram_open, | ||
271 | .read = rsxx_cram_read, | ||
272 | .write = rsxx_cram_write, | ||
273 | .release = rsxx_cram_release, | ||
274 | }; | ||
275 | |||
276 | static const struct file_operations debugfs_stats_fops = { | ||
277 | .owner = THIS_MODULE, | ||
278 | .open = rsxx_attr_stats_open, | ||
279 | .read = seq_read, | ||
280 | .llseek = seq_lseek, | ||
281 | .release = single_release, | ||
282 | }; | ||
283 | |||
284 | static const struct file_operations debugfs_pci_regs_fops = { | ||
285 | .owner = THIS_MODULE, | ||
286 | .open = rsxx_attr_pci_regs_open, | ||
287 | .read = seq_read, | ||
288 | .llseek = seq_lseek, | ||
289 | .release = single_release, | ||
290 | }; | ||
291 | |||
292 | static void rsxx_debugfs_dev_new(struct rsxx_cardinfo *card) | ||
293 | { | ||
294 | struct dentry *debugfs_stats; | ||
295 | struct dentry *debugfs_pci_regs; | ||
296 | struct dentry *debugfs_cram; | ||
297 | |||
298 | card->debugfs_dir = debugfs_create_dir(card->gendisk->disk_name, NULL); | ||
299 | if (IS_ERR_OR_NULL(card->debugfs_dir)) | ||
300 | goto failed_debugfs_dir; | ||
301 | |||
302 | debugfs_stats = debugfs_create_file("stats", S_IRUGO, | ||
303 | card->debugfs_dir, card, | ||
304 | &debugfs_stats_fops); | ||
305 | if (IS_ERR_OR_NULL(debugfs_stats)) | ||
306 | goto failed_debugfs_stats; | ||
307 | |||
308 | debugfs_pci_regs = debugfs_create_file("pci_regs", S_IRUGO, | ||
309 | card->debugfs_dir, card, | ||
310 | &debugfs_pci_regs_fops); | ||
311 | if (IS_ERR_OR_NULL(debugfs_pci_regs)) | ||
312 | goto failed_debugfs_pci_regs; | ||
313 | |||
314 | debugfs_cram = debugfs_create_file("cram", S_IRUGO | S_IWUSR, | ||
315 | card->debugfs_dir, card, | ||
316 | &debugfs_cram_fops); | ||
317 | if (IS_ERR_OR_NULL(debugfs_cram)) | ||
318 | goto failed_debugfs_cram; | ||
319 | |||
320 | return; | ||
321 | failed_debugfs_cram: | ||
322 | debugfs_remove(debugfs_pci_regs); | ||
323 | failed_debugfs_pci_regs: | ||
324 | debugfs_remove(debugfs_stats); | ||
325 | failed_debugfs_stats: | ||
326 | debugfs_remove(card->debugfs_dir); | ||
327 | failed_debugfs_dir: | ||
328 | card->debugfs_dir = NULL; | ||
329 | } | ||
330 | |||
55 | /*----------------- Interrupt Control & Handling -------------------*/ | 331 | /*----------------- Interrupt Control & Handling -------------------*/ |
56 | 332 | ||
57 | static void rsxx_mask_interrupts(struct rsxx_cardinfo *card) | 333 | static void rsxx_mask_interrupts(struct rsxx_cardinfo *card) |
@@ -163,12 +439,13 @@ static irqreturn_t rsxx_isr(int irq, void *pdata) | |||
163 | } | 439 | } |
164 | 440 | ||
165 | if (isr & CR_INTR_CREG) { | 441 | if (isr & CR_INTR_CREG) { |
166 | schedule_work(&card->creg_ctrl.done_work); | 442 | queue_work(card->creg_ctrl.creg_wq, |
443 | &card->creg_ctrl.done_work); | ||
167 | handled++; | 444 | handled++; |
168 | } | 445 | } |
169 | 446 | ||
170 | if (isr & CR_INTR_EVENT) { | 447 | if (isr & CR_INTR_EVENT) { |
171 | schedule_work(&card->event_work); | 448 | queue_work(card->event_wq, &card->event_work); |
172 | rsxx_disable_ier_and_isr(card, CR_INTR_EVENT); | 449 | rsxx_disable_ier_and_isr(card, CR_INTR_EVENT); |
173 | handled++; | 450 | handled++; |
174 | } | 451 | } |
@@ -329,7 +606,7 @@ static int rsxx_eeh_frozen(struct pci_dev *dev) | |||
329 | int i; | 606 | int i; |
330 | int st; | 607 | int st; |
331 | 608 | ||
332 | dev_warn(&dev->dev, "IBM FlashSystem PCI: preparing for slot reset.\n"); | 609 | dev_warn(&dev->dev, "IBM Flash Adapter PCI: preparing for slot reset.\n"); |
333 | 610 | ||
334 | card->eeh_state = 1; | 611 | card->eeh_state = 1; |
335 | rsxx_mask_interrupts(card); | 612 | rsxx_mask_interrupts(card); |
@@ -367,15 +644,26 @@ static void rsxx_eeh_failure(struct pci_dev *dev) | |||
367 | { | 644 | { |
368 | struct rsxx_cardinfo *card = pci_get_drvdata(dev); | 645 | struct rsxx_cardinfo *card = pci_get_drvdata(dev); |
369 | int i; | 646 | int i; |
647 | int cnt = 0; | ||
370 | 648 | ||
371 | dev_err(&dev->dev, "IBM FlashSystem PCI: disabling failed card.\n"); | 649 | dev_err(&dev->dev, "IBM Flash Adapter PCI: disabling failed card.\n"); |
372 | 650 | ||
373 | card->eeh_state = 1; | 651 | card->eeh_state = 1; |
652 | card->halt = 1; | ||
374 | 653 | ||
375 | for (i = 0; i < card->n_targets; i++) | 654 | for (i = 0; i < card->n_targets; i++) { |
376 | del_timer_sync(&card->ctrl[i].activity_timer); | 655 | spin_lock_bh(&card->ctrl[i].queue_lock); |
656 | cnt = rsxx_cleanup_dma_queue(&card->ctrl[i], | ||
657 | &card->ctrl[i].queue); | ||
658 | spin_unlock_bh(&card->ctrl[i].queue_lock); | ||
659 | |||
660 | cnt += rsxx_dma_cancel(&card->ctrl[i]); | ||
377 | 661 | ||
378 | rsxx_eeh_cancel_dmas(card); | 662 | if (cnt) |
663 | dev_info(CARD_TO_DEV(card), | ||
664 | "Freed %d queued DMAs on channel %d\n", | ||
665 | cnt, card->ctrl[i].id); | ||
666 | } | ||
379 | } | 667 | } |
380 | 668 | ||
381 | static int rsxx_eeh_fifo_flush_poll(struct rsxx_cardinfo *card) | 669 | static int rsxx_eeh_fifo_flush_poll(struct rsxx_cardinfo *card) |
@@ -432,7 +720,7 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev) | |||
432 | int st; | 720 | int st; |
433 | 721 | ||
434 | dev_warn(&dev->dev, | 722 | dev_warn(&dev->dev, |
435 | "IBM FlashSystem PCI: recovering from slot reset.\n"); | 723 | "IBM Flash Adapter PCI: recovering from slot reset.\n"); |
436 | 724 | ||
437 | st = pci_enable_device(dev); | 725 | st = pci_enable_device(dev); |
438 | if (st) | 726 | if (st) |
@@ -485,7 +773,7 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev) | |||
485 | &card->ctrl[i].issue_dma_work); | 773 | &card->ctrl[i].issue_dma_work); |
486 | } | 774 | } |
487 | 775 | ||
488 | dev_info(&dev->dev, "IBM FlashSystem PCI: recovery complete.\n"); | 776 | dev_info(&dev->dev, "IBM Flash Adapter PCI: recovery complete.\n"); |
489 | 777 | ||
490 | return PCI_ERS_RESULT_RECOVERED; | 778 | return PCI_ERS_RESULT_RECOVERED; |
491 | 779 | ||
@@ -528,6 +816,7 @@ static int rsxx_pci_probe(struct pci_dev *dev, | |||
528 | { | 816 | { |
529 | struct rsxx_cardinfo *card; | 817 | struct rsxx_cardinfo *card; |
530 | int st; | 818 | int st; |
819 | unsigned int sync_timeout; | ||
531 | 820 | ||
532 | dev_info(&dev->dev, "PCI-Flash SSD discovered\n"); | 821 | dev_info(&dev->dev, "PCI-Flash SSD discovered\n"); |
533 | 822 | ||
@@ -610,7 +899,11 @@ static int rsxx_pci_probe(struct pci_dev *dev, | |||
610 | } | 899 | } |
611 | 900 | ||
612 | /************* Setup Processor Command Interface *************/ | 901 | /************* Setup Processor Command Interface *************/ |
613 | rsxx_creg_setup(card); | 902 | st = rsxx_creg_setup(card); |
903 | if (st) { | ||
904 | dev_err(CARD_TO_DEV(card), "Failed to setup creg interface.\n"); | ||
905 | goto failed_creg_setup; | ||
906 | } | ||
614 | 907 | ||
615 | spin_lock_irq(&card->irq_lock); | 908 | spin_lock_irq(&card->irq_lock); |
616 | rsxx_enable_ier_and_isr(card, CR_INTR_CREG); | 909 | rsxx_enable_ier_and_isr(card, CR_INTR_CREG); |
@@ -650,6 +943,12 @@ static int rsxx_pci_probe(struct pci_dev *dev, | |||
650 | } | 943 | } |
651 | 944 | ||
652 | /************* Setup Card Event Handler *************/ | 945 | /************* Setup Card Event Handler *************/ |
946 | card->event_wq = create_singlethread_workqueue(DRIVER_NAME"_event"); | ||
947 | if (!card->event_wq) { | ||
948 | dev_err(CARD_TO_DEV(card), "Failed card event setup.\n"); | ||
949 | goto failed_event_handler; | ||
950 | } | ||
951 | |||
653 | INIT_WORK(&card->event_work, card_event_handler); | 952 | INIT_WORK(&card->event_work, card_event_handler); |
654 | 953 | ||
655 | st = rsxx_setup_dev(card); | 954 | st = rsxx_setup_dev(card); |
@@ -676,6 +975,33 @@ static int rsxx_pci_probe(struct pci_dev *dev, | |||
676 | if (st) | 975 | if (st) |
677 | dev_crit(CARD_TO_DEV(card), | 976 | dev_crit(CARD_TO_DEV(card), |
678 | "Failed issuing card startup\n"); | 977 | "Failed issuing card startup\n"); |
978 | if (sync_start) { | ||
979 | sync_timeout = SYNC_START_TIMEOUT; | ||
980 | |||
981 | dev_info(CARD_TO_DEV(card), | ||
982 | "Waiting for card to startup\n"); | ||
983 | |||
984 | do { | ||
985 | ssleep(1); | ||
986 | sync_timeout--; | ||
987 | |||
988 | rsxx_get_card_state(card, &card->state); | ||
989 | } while (sync_timeout && | ||
990 | (card->state == CARD_STATE_STARTING)); | ||
991 | |||
992 | if (card->state == CARD_STATE_STARTING) { | ||
993 | dev_warn(CARD_TO_DEV(card), | ||
994 | "Card startup timed out\n"); | ||
995 | card->size8 = 0; | ||
996 | } else { | ||
997 | dev_info(CARD_TO_DEV(card), | ||
998 | "card state: %s\n", | ||
999 | rsxx_card_state_to_str(card->state)); | ||
1000 | st = rsxx_get_card_size8(card, &card->size8); | ||
1001 | if (st) | ||
1002 | card->size8 = 0; | ||
1003 | } | ||
1004 | } | ||
679 | } else if (card->state == CARD_STATE_GOOD || | 1005 | } else if (card->state == CARD_STATE_GOOD || |
680 | card->state == CARD_STATE_RD_ONLY_FAULT) { | 1006 | card->state == CARD_STATE_RD_ONLY_FAULT) { |
681 | st = rsxx_get_card_size8(card, &card->size8); | 1007 | st = rsxx_get_card_size8(card, &card->size8); |
@@ -685,12 +1011,21 @@ static int rsxx_pci_probe(struct pci_dev *dev, | |||
685 | 1011 | ||
686 | rsxx_attach_dev(card); | 1012 | rsxx_attach_dev(card); |
687 | 1013 | ||
1014 | /************* Setup Debugfs *************/ | ||
1015 | rsxx_debugfs_dev_new(card); | ||
1016 | |||
688 | return 0; | 1017 | return 0; |
689 | 1018 | ||
690 | failed_create_dev: | 1019 | failed_create_dev: |
1020 | destroy_workqueue(card->event_wq); | ||
1021 | card->event_wq = NULL; | ||
1022 | failed_event_handler: | ||
691 | rsxx_dma_destroy(card); | 1023 | rsxx_dma_destroy(card); |
692 | failed_dma_setup: | 1024 | failed_dma_setup: |
693 | failed_compatiblity_check: | 1025 | failed_compatiblity_check: |
1026 | destroy_workqueue(card->creg_ctrl.creg_wq); | ||
1027 | card->creg_ctrl.creg_wq = NULL; | ||
1028 | failed_creg_setup: | ||
694 | spin_lock_irq(&card->irq_lock); | 1029 | spin_lock_irq(&card->irq_lock); |
695 | rsxx_disable_ier_and_isr(card, CR_INTR_ALL); | 1030 | rsxx_disable_ier_and_isr(card, CR_INTR_ALL); |
696 | spin_unlock_irq(&card->irq_lock); | 1031 | spin_unlock_irq(&card->irq_lock); |
@@ -756,6 +1091,8 @@ static void rsxx_pci_remove(struct pci_dev *dev) | |||
756 | /* Prevent work_structs from re-queuing themselves. */ | 1091 | /* Prevent work_structs from re-queuing themselves. */ |
757 | card->halt = 1; | 1092 | card->halt = 1; |
758 | 1093 | ||
1094 | debugfs_remove_recursive(card->debugfs_dir); | ||
1095 | |||
759 | free_irq(dev->irq, card); | 1096 | free_irq(dev->irq, card); |
760 | 1097 | ||
761 | if (!force_legacy) | 1098 | if (!force_legacy) |
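The core.c hunks above stop calling schedule_work() from the ISR and instead queue the creg completion and card-event handlers onto their own single-threaded workqueues, which the probe and creg-setup paths create and the error unwind destroys. A small, self-contained module sketch of that dedicated-workqueue pattern; the demo_* names are purely illustrative and not part of the driver:

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;
static struct work_struct demo_work;

static void demo_work_fn(struct work_struct *work)
{
        pr_info("demo: running on a dedicated single-threaded workqueue\n");
}

static int __init demo_init(void)
{
        /* One kernel thread services this queue, so items never run concurrently. */
        demo_wq = create_singlethread_workqueue("demo_wq");
        if (!demo_wq)
                return -ENOMEM;

        INIT_WORK(&demo_work, demo_work_fn);
        queue_work(demo_wq, &demo_work);        /* instead of schedule_work(&demo_work) */
        return 0;
}

static void __exit demo_exit(void)
{
        flush_workqueue(demo_wq);
        destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");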
diff --git a/drivers/block/rsxx/cregs.c b/drivers/block/rsxx/cregs.c index 4b5c020a0a65..926dce9c452f 100644 --- a/drivers/block/rsxx/cregs.c +++ b/drivers/block/rsxx/cregs.c | |||
@@ -431,6 +431,15 @@ static int __issue_creg_rw(struct rsxx_cardinfo *card, | |||
431 | *hw_stat = completion.creg_status; | 431 | *hw_stat = completion.creg_status; |
432 | 432 | ||
433 | if (completion.st) { | 433 | if (completion.st) { |
434 | /* | ||
435 | * This read is needed to verify that there have not been any | ||
436 | * extreme errors that might have occurred, i.e. EEH. The | ||
437 | * function iowrite32 will not detect EEH errors, so it is | ||
438 | * necessary that we recover if such an error is the reason | ||
439 | * for the timeout. This is a dummy read. | ||
440 | */ | ||
441 | ioread32(card->regmap + SCRATCH); | ||
442 | |||
434 | dev_warn(CARD_TO_DEV(card), | 443 | dev_warn(CARD_TO_DEV(card), |
435 | "creg command failed(%d x%08x)\n", | 444 | "creg command failed(%d x%08x)\n", |
436 | completion.st, addr); | 445 | completion.st, addr); |
@@ -727,6 +736,11 @@ int rsxx_creg_setup(struct rsxx_cardinfo *card) | |||
727 | { | 736 | { |
728 | card->creg_ctrl.active_cmd = NULL; | 737 | card->creg_ctrl.active_cmd = NULL; |
729 | 738 | ||
739 | card->creg_ctrl.creg_wq = | ||
740 | create_singlethread_workqueue(DRIVER_NAME"_creg"); | ||
741 | if (!card->creg_ctrl.creg_wq) | ||
742 | return -ENOMEM; | ||
743 | |||
730 | INIT_WORK(&card->creg_ctrl.done_work, creg_cmd_done); | 744 | INIT_WORK(&card->creg_ctrl.done_work, creg_cmd_done); |
731 | mutex_init(&card->creg_ctrl.reset_lock); | 745 | mutex_init(&card->creg_ctrl.reset_lock); |
732 | INIT_LIST_HEAD(&card->creg_ctrl.queue); | 746 | INIT_LIST_HEAD(&card->creg_ctrl.queue); |
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index 4346d17d2949..d7af441880be 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c | |||
@@ -155,7 +155,8 @@ static void bio_dma_done_cb(struct rsxx_cardinfo *card, | |||
155 | atomic_set(&meta->error, 1); | 155 | atomic_set(&meta->error, 1); |
156 | 156 | ||
157 | if (atomic_dec_and_test(&meta->pending_dmas)) { | 157 | if (atomic_dec_and_test(&meta->pending_dmas)) { |
158 | disk_stats_complete(card, meta->bio, meta->start_time); | 158 | if (!card->eeh_state && card->gendisk) |
159 | disk_stats_complete(card, meta->bio, meta->start_time); | ||
159 | 160 | ||
160 | bio_endio(meta->bio, atomic_read(&meta->error) ? -EIO : 0); | 161 | bio_endio(meta->bio, atomic_read(&meta->error) ? -EIO : 0); |
161 | kmem_cache_free(bio_meta_pool, meta); | 162 | kmem_cache_free(bio_meta_pool, meta); |
@@ -170,6 +171,12 @@ static void rsxx_make_request(struct request_queue *q, struct bio *bio) | |||
170 | 171 | ||
171 | might_sleep(); | 172 | might_sleep(); |
172 | 173 | ||
174 | if (!card) | ||
175 | goto req_err; | ||
176 | |||
177 | if (bio->bi_sector + (bio->bi_size >> 9) > get_capacity(card->gendisk)) | ||
178 | goto req_err; | ||
179 | |||
173 | if (unlikely(card->halt)) { | 180 | if (unlikely(card->halt)) { |
174 | st = -EFAULT; | 181 | st = -EFAULT; |
175 | goto req_err; | 182 | goto req_err; |
@@ -196,7 +203,8 @@ static void rsxx_make_request(struct request_queue *q, struct bio *bio) | |||
196 | atomic_set(&bio_meta->pending_dmas, 0); | 203 | atomic_set(&bio_meta->pending_dmas, 0); |
197 | bio_meta->start_time = jiffies; | 204 | bio_meta->start_time = jiffies; |
198 | 205 | ||
199 | disk_stats_start(card, bio); | 206 | if (!unlikely(card->halt)) |
207 | disk_stats_start(card, bio); | ||
200 | 208 | ||
201 | dev_dbg(CARD_TO_DEV(card), "BIO[%c]: meta: %p addr8: x%llx size: %d\n", | 209 | dev_dbg(CARD_TO_DEV(card), "BIO[%c]: meta: %p addr8: x%llx size: %d\n", |
202 | bio_data_dir(bio) ? 'W' : 'R', bio_meta, | 210 | bio_data_dir(bio) ? 'W' : 'R', bio_meta, |
@@ -225,24 +233,6 @@ static bool rsxx_discard_supported(struct rsxx_cardinfo *card) | |||
225 | return (pci_rev >= RSXX_DISCARD_SUPPORT); | 233 | return (pci_rev >= RSXX_DISCARD_SUPPORT); |
226 | } | 234 | } |
227 | 235 | ||
228 | static unsigned short rsxx_get_logical_block_size( | ||
229 | struct rsxx_cardinfo *card) | ||
230 | { | ||
231 | u32 capabilities = 0; | ||
232 | int st; | ||
233 | |||
234 | st = rsxx_get_card_capabilities(card, &capabilities); | ||
235 | if (st) | ||
236 | dev_warn(CARD_TO_DEV(card), | ||
237 | "Failed reading card capabilities register\n"); | ||
238 | |||
239 | /* Earlier firmware did not have support for 512 byte accesses */ | ||
240 | if (capabilities & CARD_CAP_SUBPAGE_WRITES) | ||
241 | return 512; | ||
242 | else | ||
243 | return RSXX_HW_BLK_SIZE; | ||
244 | } | ||
245 | |||
246 | int rsxx_attach_dev(struct rsxx_cardinfo *card) | 236 | int rsxx_attach_dev(struct rsxx_cardinfo *card) |
247 | { | 237 | { |
248 | mutex_lock(&card->dev_lock); | 238 | mutex_lock(&card->dev_lock); |
@@ -305,7 +295,7 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card) | |||
305 | return -ENOMEM; | 295 | return -ENOMEM; |
306 | } | 296 | } |
307 | 297 | ||
308 | blk_size = rsxx_get_logical_block_size(card); | 298 | blk_size = card->config.data.block_size; |
309 | 299 | ||
310 | blk_queue_make_request(card->queue, rsxx_make_request); | 300 | blk_queue_make_request(card->queue, rsxx_make_request); |
311 | blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY); | 301 | blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY); |
@@ -347,6 +337,7 @@ void rsxx_destroy_dev(struct rsxx_cardinfo *card) | |||
347 | card->gendisk = NULL; | 337 | card->gendisk = NULL; |
348 | 338 | ||
349 | blk_cleanup_queue(card->queue); | 339 | blk_cleanup_queue(card->queue); |
340 | card->queue->queuedata = NULL; | ||
350 | unregister_blkdev(card->major, DRIVER_NAME); | 341 | unregister_blkdev(card->major, DRIVER_NAME); |
351 | } | 342 | } |
352 | 343 | ||
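rsxx_make_request() now rejects bios that would run past the end of the device, converting the byte length to 512-byte sectors with bi_size >> 9 before comparing against get_capacity(). A tiny user-space sketch of that arithmetic; all of the numbers below are made-up example values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t capacity_sectors = 2048;       /* hypothetical device: 1 MiB */
        uint64_t bi_sector = 2040;              /* starting sector of the bio */
        uint32_t bi_size = 8192;                /* bio length in bytes */

        /* Same check as the driver: start + length-in-sectors vs. capacity. */
        if (bi_sector + (bi_size >> 9) > capacity_sectors)
                printf("reject: bio ends at sector %llu, capacity is %llu\n",
                       (unsigned long long)(bi_sector + (bi_size >> 9)),
                       (unsigned long long)capacity_sectors);
        else
                printf("accept\n");
        return 0;
}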
diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c index 0607513cfb41..bed32f16b084 100644 --- a/drivers/block/rsxx/dma.c +++ b/drivers/block/rsxx/dma.c | |||
@@ -245,6 +245,22 @@ static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl, | |||
245 | kmem_cache_free(rsxx_dma_pool, dma); | 245 | kmem_cache_free(rsxx_dma_pool, dma); |
246 | } | 246 | } |
247 | 247 | ||
248 | int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, | ||
249 | struct list_head *q) | ||
250 | { | ||
251 | struct rsxx_dma *dma; | ||
252 | struct rsxx_dma *tmp; | ||
253 | int cnt = 0; | ||
254 | |||
255 | list_for_each_entry_safe(dma, tmp, q, list) { | ||
256 | list_del(&dma->list); | ||
257 | rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); | ||
258 | cnt++; | ||
259 | } | ||
260 | |||
261 | return cnt; | ||
262 | } | ||
263 | |||
248 | static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl, | 264 | static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl, |
249 | struct rsxx_dma *dma) | 265 | struct rsxx_dma *dma) |
250 | { | 266 | { |
@@ -252,9 +268,10 @@ static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl, | |||
252 | * Requeued DMAs go to the front of the queue so they are issued | 268 | * Requeued DMAs go to the front of the queue so they are issued |
253 | * first. | 269 | * first. |
254 | */ | 270 | */ |
255 | spin_lock(&ctrl->queue_lock); | 271 | spin_lock_bh(&ctrl->queue_lock); |
272 | ctrl->stats.sw_q_depth++; | ||
256 | list_add(&dma->list, &ctrl->queue); | 273 | list_add(&dma->list, &ctrl->queue); |
257 | spin_unlock(&ctrl->queue_lock); | 274 | spin_unlock_bh(&ctrl->queue_lock); |
258 | } | 275 | } |
259 | 276 | ||
260 | static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl, | 277 | static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl, |
@@ -329,6 +346,7 @@ static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl, | |||
329 | static void dma_engine_stalled(unsigned long data) | 346 | static void dma_engine_stalled(unsigned long data) |
330 | { | 347 | { |
331 | struct rsxx_dma_ctrl *ctrl = (struct rsxx_dma_ctrl *)data; | 348 | struct rsxx_dma_ctrl *ctrl = (struct rsxx_dma_ctrl *)data; |
349 | int cnt; | ||
332 | 350 | ||
333 | if (atomic_read(&ctrl->stats.hw_q_depth) == 0 || | 351 | if (atomic_read(&ctrl->stats.hw_q_depth) == 0 || |
334 | unlikely(ctrl->card->eeh_state)) | 352 | unlikely(ctrl->card->eeh_state)) |
@@ -349,18 +367,28 @@ static void dma_engine_stalled(unsigned long data) | |||
349 | "DMA channel %d has stalled, faulting interface.\n", | 367 | "DMA channel %d has stalled, faulting interface.\n", |
350 | ctrl->id); | 368 | ctrl->id); |
351 | ctrl->card->dma_fault = 1; | 369 | ctrl->card->dma_fault = 1; |
370 | |||
371 | /* Clean up the DMA queue */ | ||
372 | spin_lock(&ctrl->queue_lock); | ||
373 | cnt = rsxx_cleanup_dma_queue(ctrl, &ctrl->queue); | ||
374 | spin_unlock(&ctrl->queue_lock); | ||
375 | |||
376 | cnt += rsxx_dma_cancel(ctrl); | ||
377 | |||
378 | if (cnt) | ||
379 | dev_info(CARD_TO_DEV(ctrl->card), | ||
380 | "Freed %d queued DMAs on channel %d\n", | ||
381 | cnt, ctrl->id); | ||
352 | } | 382 | } |
353 | } | 383 | } |
354 | 384 | ||
355 | static void rsxx_issue_dmas(struct work_struct *work) | 385 | static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl) |
356 | { | 386 | { |
357 | struct rsxx_dma_ctrl *ctrl; | ||
358 | struct rsxx_dma *dma; | 387 | struct rsxx_dma *dma; |
359 | int tag; | 388 | int tag; |
360 | int cmds_pending = 0; | 389 | int cmds_pending = 0; |
361 | struct hw_cmd *hw_cmd_buf; | 390 | struct hw_cmd *hw_cmd_buf; |
362 | 391 | ||
363 | ctrl = container_of(work, struct rsxx_dma_ctrl, issue_dma_work); | ||
364 | hw_cmd_buf = ctrl->cmd.buf; | 392 | hw_cmd_buf = ctrl->cmd.buf; |
365 | 393 | ||
366 | if (unlikely(ctrl->card->halt) || | 394 | if (unlikely(ctrl->card->halt) || |
@@ -368,22 +396,22 @@ static void rsxx_issue_dmas(struct work_struct *work) | |||
368 | return; | 396 | return; |
369 | 397 | ||
370 | while (1) { | 398 | while (1) { |
371 | spin_lock(&ctrl->queue_lock); | 399 | spin_lock_bh(&ctrl->queue_lock); |
372 | if (list_empty(&ctrl->queue)) { | 400 | if (list_empty(&ctrl->queue)) { |
373 | spin_unlock(&ctrl->queue_lock); | 401 | spin_unlock_bh(&ctrl->queue_lock); |
374 | break; | 402 | break; |
375 | } | 403 | } |
376 | spin_unlock(&ctrl->queue_lock); | 404 | spin_unlock_bh(&ctrl->queue_lock); |
377 | 405 | ||
378 | tag = pop_tracker(ctrl->trackers); | 406 | tag = pop_tracker(ctrl->trackers); |
379 | if (tag == -1) | 407 | if (tag == -1) |
380 | break; | 408 | break; |
381 | 409 | ||
382 | spin_lock(&ctrl->queue_lock); | 410 | spin_lock_bh(&ctrl->queue_lock); |
383 | dma = list_entry(ctrl->queue.next, struct rsxx_dma, list); | 411 | dma = list_entry(ctrl->queue.next, struct rsxx_dma, list); |
384 | list_del(&dma->list); | 412 | list_del(&dma->list); |
385 | ctrl->stats.sw_q_depth--; | 413 | ctrl->stats.sw_q_depth--; |
386 | spin_unlock(&ctrl->queue_lock); | 414 | spin_unlock_bh(&ctrl->queue_lock); |
387 | 415 | ||
388 | /* | 416 | /* |
389 | * This will catch any DMAs that slipped in right before the | 417 | * This will catch any DMAs that slipped in right before the |
@@ -440,9 +468,8 @@ static void rsxx_issue_dmas(struct work_struct *work) | |||
440 | } | 468 | } |
441 | } | 469 | } |
442 | 470 | ||
443 | static void rsxx_dma_done(struct work_struct *work) | 471 | static void rsxx_dma_done(struct rsxx_dma_ctrl *ctrl) |
444 | { | 472 | { |
445 | struct rsxx_dma_ctrl *ctrl; | ||
446 | struct rsxx_dma *dma; | 473 | struct rsxx_dma *dma; |
447 | unsigned long flags; | 474 | unsigned long flags; |
448 | u16 count; | 475 | u16 count; |
@@ -450,7 +477,6 @@ static void rsxx_dma_done(struct work_struct *work) | |||
450 | u8 tag; | 477 | u8 tag; |
451 | struct hw_status *hw_st_buf; | 478 | struct hw_status *hw_st_buf; |
452 | 479 | ||
453 | ctrl = container_of(work, struct rsxx_dma_ctrl, dma_done_work); | ||
454 | hw_st_buf = ctrl->status.buf; | 480 | hw_st_buf = ctrl->status.buf; |
455 | 481 | ||
456 | if (unlikely(ctrl->card->halt) || | 482 | if (unlikely(ctrl->card->halt) || |
@@ -520,33 +546,32 @@ static void rsxx_dma_done(struct work_struct *work) | |||
520 | rsxx_enable_ier(ctrl->card, CR_INTR_DMA(ctrl->id)); | 546 | rsxx_enable_ier(ctrl->card, CR_INTR_DMA(ctrl->id)); |
521 | spin_unlock_irqrestore(&ctrl->card->irq_lock, flags); | 547 | spin_unlock_irqrestore(&ctrl->card->irq_lock, flags); |
522 | 548 | ||
523 | spin_lock(&ctrl->queue_lock); | 549 | spin_lock_bh(&ctrl->queue_lock); |
524 | if (ctrl->stats.sw_q_depth) | 550 | if (ctrl->stats.sw_q_depth) |
525 | queue_work(ctrl->issue_wq, &ctrl->issue_dma_work); | 551 | queue_work(ctrl->issue_wq, &ctrl->issue_dma_work); |
526 | spin_unlock(&ctrl->queue_lock); | 552 | spin_unlock_bh(&ctrl->queue_lock); |
527 | } | 553 | } |
528 | 554 | ||
529 | static int rsxx_cleanup_dma_queue(struct rsxx_cardinfo *card, | 555 | static void rsxx_schedule_issue(struct work_struct *work) |
530 | struct list_head *q) | ||
531 | { | 556 | { |
532 | struct rsxx_dma *dma; | 557 | struct rsxx_dma_ctrl *ctrl; |
533 | struct rsxx_dma *tmp; | ||
534 | int cnt = 0; | ||
535 | 558 | ||
536 | list_for_each_entry_safe(dma, tmp, q, list) { | 559 | ctrl = container_of(work, struct rsxx_dma_ctrl, issue_dma_work); |
537 | list_del(&dma->list); | ||
538 | 560 | ||
539 | if (dma->dma_addr) | 561 | mutex_lock(&ctrl->work_lock); |
540 | pci_unmap_page(card->dev, dma->dma_addr, | 562 | rsxx_issue_dmas(ctrl); |
541 | get_dma_size(dma), | 563 | mutex_unlock(&ctrl->work_lock); |
542 | (dma->cmd == HW_CMD_BLK_WRITE) ? | 564 | } |
543 | PCI_DMA_TODEVICE : | ||
544 | PCI_DMA_FROMDEVICE); | ||
545 | kmem_cache_free(rsxx_dma_pool, dma); | ||
546 | cnt++; | ||
547 | } | ||
548 | 565 | ||
549 | return cnt; | 566 | static void rsxx_schedule_done(struct work_struct *work) |
567 | { | ||
568 | struct rsxx_dma_ctrl *ctrl; | ||
569 | |||
570 | ctrl = container_of(work, struct rsxx_dma_ctrl, dma_done_work); | ||
571 | |||
572 | mutex_lock(&ctrl->work_lock); | ||
573 | rsxx_dma_done(ctrl); | ||
574 | mutex_unlock(&ctrl->work_lock); | ||
550 | } | 575 | } |
551 | 576 | ||
552 | static int rsxx_queue_discard(struct rsxx_cardinfo *card, | 577 | static int rsxx_queue_discard(struct rsxx_cardinfo *card, |
@@ -698,10 +723,10 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, | |||
698 | 723 | ||
699 | for (i = 0; i < card->n_targets; i++) { | 724 | for (i = 0; i < card->n_targets; i++) { |
700 | if (!list_empty(&dma_list[i])) { | 725 | if (!list_empty(&dma_list[i])) { |
701 | spin_lock(&card->ctrl[i].queue_lock); | 726 | spin_lock_bh(&card->ctrl[i].queue_lock); |
702 | card->ctrl[i].stats.sw_q_depth += dma_cnt[i]; | 727 | card->ctrl[i].stats.sw_q_depth += dma_cnt[i]; |
703 | list_splice_tail(&dma_list[i], &card->ctrl[i].queue); | 728 | list_splice_tail(&dma_list[i], &card->ctrl[i].queue); |
704 | spin_unlock(&card->ctrl[i].queue_lock); | 729 | spin_unlock_bh(&card->ctrl[i].queue_lock); |
705 | 730 | ||
706 | queue_work(card->ctrl[i].issue_wq, | 731 | queue_work(card->ctrl[i].issue_wq, |
707 | &card->ctrl[i].issue_dma_work); | 732 | &card->ctrl[i].issue_dma_work); |
@@ -711,8 +736,11 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, | |||
711 | return 0; | 736 | return 0; |
712 | 737 | ||
713 | bvec_err: | 738 | bvec_err: |
714 | for (i = 0; i < card->n_targets; i++) | 739 | for (i = 0; i < card->n_targets; i++) { |
715 | rsxx_cleanup_dma_queue(card, &dma_list[i]); | 740 | spin_lock_bh(&card->ctrl[i].queue_lock); |
741 | rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i]); | ||
742 | spin_unlock_bh(&card->ctrl[i].queue_lock); | ||
743 | } | ||
716 | 744 | ||
717 | return st; | 745 | return st; |
718 | } | 746 | } |
@@ -780,6 +808,7 @@ static int rsxx_dma_ctrl_init(struct pci_dev *dev, | |||
780 | spin_lock_init(&ctrl->trackers->lock); | 808 | spin_lock_init(&ctrl->trackers->lock); |
781 | 809 | ||
782 | spin_lock_init(&ctrl->queue_lock); | 810 | spin_lock_init(&ctrl->queue_lock); |
811 | mutex_init(&ctrl->work_lock); | ||
783 | INIT_LIST_HEAD(&ctrl->queue); | 812 | INIT_LIST_HEAD(&ctrl->queue); |
784 | 813 | ||
785 | setup_timer(&ctrl->activity_timer, dma_engine_stalled, | 814 | setup_timer(&ctrl->activity_timer, dma_engine_stalled, |
@@ -793,8 +822,8 @@ static int rsxx_dma_ctrl_init(struct pci_dev *dev, | |||
793 | if (!ctrl->done_wq) | 822 | if (!ctrl->done_wq) |
794 | return -ENOMEM; | 823 | return -ENOMEM; |
795 | 824 | ||
796 | INIT_WORK(&ctrl->issue_dma_work, rsxx_issue_dmas); | 825 | INIT_WORK(&ctrl->issue_dma_work, rsxx_schedule_issue); |
797 | INIT_WORK(&ctrl->dma_done_work, rsxx_dma_done); | 826 | INIT_WORK(&ctrl->dma_done_work, rsxx_schedule_done); |
798 | 827 | ||
799 | st = rsxx_hw_buffers_init(dev, ctrl); | 828 | st = rsxx_hw_buffers_init(dev, ctrl); |
800 | if (st) | 829 | if (st) |
@@ -918,13 +947,30 @@ failed_dma_setup: | |||
918 | return st; | 947 | return st; |
919 | } | 948 | } |
920 | 949 | ||
950 | int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl) | ||
951 | { | ||
952 | struct rsxx_dma *dma; | ||
953 | int i; | ||
954 | int cnt = 0; | ||
955 | |||
956 | /* Clean up issued DMAs */ | ||
957 | for (i = 0; i < RSXX_MAX_OUTSTANDING_CMDS; i++) { | ||
958 | dma = get_tracker_dma(ctrl->trackers, i); | ||
959 | if (dma) { | ||
960 | atomic_dec(&ctrl->stats.hw_q_depth); | ||
961 | rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); | ||
962 | push_tracker(ctrl->trackers, i); | ||
963 | cnt++; | ||
964 | } | ||
965 | } | ||
966 | |||
967 | return cnt; | ||
968 | } | ||
921 | 969 | ||
922 | void rsxx_dma_destroy(struct rsxx_cardinfo *card) | 970 | void rsxx_dma_destroy(struct rsxx_cardinfo *card) |
923 | { | 971 | { |
924 | struct rsxx_dma_ctrl *ctrl; | 972 | struct rsxx_dma_ctrl *ctrl; |
925 | struct rsxx_dma *dma; | 973 | int i; |
926 | int i, j; | ||
927 | int cnt = 0; | ||
928 | 974 | ||
929 | for (i = 0; i < card->n_targets; i++) { | 975 | for (i = 0; i < card->n_targets; i++) { |
930 | ctrl = &card->ctrl[i]; | 976 | ctrl = &card->ctrl[i]; |
@@ -943,33 +989,11 @@ void rsxx_dma_destroy(struct rsxx_cardinfo *card) | |||
943 | del_timer_sync(&ctrl->activity_timer); | 989 | del_timer_sync(&ctrl->activity_timer); |
944 | 990 | ||
945 | /* Clean up the DMA queue */ | 991 | /* Clean up the DMA queue */ |
946 | spin_lock(&ctrl->queue_lock); | 992 | spin_lock_bh(&ctrl->queue_lock); |
947 | cnt = rsxx_cleanup_dma_queue(card, &ctrl->queue); | 993 | rsxx_cleanup_dma_queue(ctrl, &ctrl->queue); |
948 | spin_unlock(&ctrl->queue_lock); | 994 | spin_unlock_bh(&ctrl->queue_lock); |
949 | |||
950 | if (cnt) | ||
951 | dev_info(CARD_TO_DEV(card), | ||
952 | "Freed %d queued DMAs on channel %d\n", | ||
953 | cnt, i); | ||
954 | |||
955 | /* Clean up issued DMAs */ | ||
956 | for (j = 0; j < RSXX_MAX_OUTSTANDING_CMDS; j++) { | ||
957 | dma = get_tracker_dma(ctrl->trackers, j); | ||
958 | if (dma) { | ||
959 | pci_unmap_page(card->dev, dma->dma_addr, | ||
960 | get_dma_size(dma), | ||
961 | (dma->cmd == HW_CMD_BLK_WRITE) ? | ||
962 | PCI_DMA_TODEVICE : | ||
963 | PCI_DMA_FROMDEVICE); | ||
964 | kmem_cache_free(rsxx_dma_pool, dma); | ||
965 | cnt++; | ||
966 | } | ||
967 | } | ||
968 | 995 | ||
969 | if (cnt) | 996 | rsxx_dma_cancel(ctrl); |
970 | dev_info(CARD_TO_DEV(card), | ||
971 | "Freed %d pending DMAs on channel %d\n", | ||
972 | cnt, i); | ||
973 | 997 | ||
974 | vfree(ctrl->trackers); | 998 | vfree(ctrl->trackers); |
975 | 999 | ||
@@ -1013,7 +1037,7 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) | |||
1013 | cnt++; | 1037 | cnt++; |
1014 | } | 1038 | } |
1015 | 1039 | ||
1016 | spin_lock(&card->ctrl[i].queue_lock); | 1040 | spin_lock_bh(&card->ctrl[i].queue_lock); |
1017 | list_splice(&issued_dmas[i], &card->ctrl[i].queue); | 1041 | list_splice(&issued_dmas[i], &card->ctrl[i].queue); |
1018 | 1042 | ||
1019 | atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth); | 1043 | atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth); |
@@ -1028,7 +1052,7 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) | |||
1028 | PCI_DMA_TODEVICE : | 1052 | PCI_DMA_TODEVICE : |
1029 | PCI_DMA_FROMDEVICE); | 1053 | PCI_DMA_FROMDEVICE); |
1030 | } | 1054 | } |
1031 | spin_unlock(&card->ctrl[i].queue_lock); | 1055 | spin_unlock_bh(&card->ctrl[i].queue_lock); |
1032 | } | 1056 | } |
1033 | 1057 | ||
1034 | kfree(issued_dmas); | 1058 | kfree(issued_dmas); |
@@ -1036,30 +1060,13 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) | |||
1036 | return 0; | 1060 | return 0; |
1037 | } | 1061 | } |
1038 | 1062 | ||
1039 | void rsxx_eeh_cancel_dmas(struct rsxx_cardinfo *card) | ||
1040 | { | ||
1041 | struct rsxx_dma *dma; | ||
1042 | struct rsxx_dma *tmp; | ||
1043 | int i; | ||
1044 | |||
1045 | for (i = 0; i < card->n_targets; i++) { | ||
1046 | spin_lock(&card->ctrl[i].queue_lock); | ||
1047 | list_for_each_entry_safe(dma, tmp, &card->ctrl[i].queue, list) { | ||
1048 | list_del(&dma->list); | ||
1049 | |||
1050 | rsxx_complete_dma(&card->ctrl[i], dma, DMA_CANCELLED); | ||
1051 | } | ||
1052 | spin_unlock(&card->ctrl[i].queue_lock); | ||
1053 | } | ||
1054 | } | ||
1055 | |||
1056 | int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card) | 1063 | int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card) |
1057 | { | 1064 | { |
1058 | struct rsxx_dma *dma; | 1065 | struct rsxx_dma *dma; |
1059 | int i; | 1066 | int i; |
1060 | 1067 | ||
1061 | for (i = 0; i < card->n_targets; i++) { | 1068 | for (i = 0; i < card->n_targets; i++) { |
1062 | spin_lock(&card->ctrl[i].queue_lock); | 1069 | spin_lock_bh(&card->ctrl[i].queue_lock); |
1063 | list_for_each_entry(dma, &card->ctrl[i].queue, list) { | 1070 | list_for_each_entry(dma, &card->ctrl[i].queue, list) { |
1064 | dma->dma_addr = pci_map_page(card->dev, dma->page, | 1071 | dma->dma_addr = pci_map_page(card->dev, dma->page, |
1065 | dma->pg_off, get_dma_size(dma), | 1072 | dma->pg_off, get_dma_size(dma), |
@@ -1067,12 +1074,12 @@ int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card) | |||
1067 | PCI_DMA_TODEVICE : | 1074 | PCI_DMA_TODEVICE : |
1068 | PCI_DMA_FROMDEVICE); | 1075 | PCI_DMA_FROMDEVICE); |
1069 | if (!dma->dma_addr) { | 1076 | if (!dma->dma_addr) { |
1070 | spin_unlock(&card->ctrl[i].queue_lock); | 1077 | spin_unlock_bh(&card->ctrl[i].queue_lock); |
1071 | kmem_cache_free(rsxx_dma_pool, dma); | 1078 | kmem_cache_free(rsxx_dma_pool, dma); |
1072 | return -ENOMEM; | 1079 | return -ENOMEM; |
1073 | } | 1080 | } |
1074 | } | 1081 | } |
1075 | spin_unlock(&card->ctrl[i].queue_lock); | 1082 | spin_unlock_bh(&card->ctrl[i].queue_lock); |
1076 | } | 1083 | } |
1077 | 1084 | ||
1078 | return 0; | 1085 | return 0; |
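Two locking changes run through the dma.c hunks above: queue_lock is now taken with spin_lock_bh() in process context (the stall timer also takes it, and timer callbacks run in softirq context), and the issue and done work handlers are wrapped by a new per-channel work_lock mutex so they can no longer interleave on the same channel. A condensed sketch of that wrapper pattern, with illustrative demo_* names rather than the driver's own:

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

struct demo_ctrl {
        spinlock_t queue_lock;          /* also taken from the softirq-context stall timer */
        struct mutex work_lock;         /* serializes the issue and done handlers */
        struct work_struct issue_work;
        struct list_head queue;
};

static void demo_issue(struct demo_ctrl *ctrl)
{
        /* _bh variant: a timer firing on this CPU cannot deadlock on the lock */
        spin_lock_bh(&ctrl->queue_lock);
        /* ... pop and submit queued entries here ... */
        spin_unlock_bh(&ctrl->queue_lock);
}

static void demo_schedule_issue(struct work_struct *work)
{
        struct demo_ctrl *ctrl = container_of(work, struct demo_ctrl, issue_work);

        mutex_lock(&ctrl->work_lock);   /* the done handler takes the same mutex */
        demo_issue(ctrl);
        mutex_unlock(&ctrl->work_lock);
}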
diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h index 382e8bf5c03b..5ad5055a4104 100644 --- a/drivers/block/rsxx/rsxx_priv.h +++ b/drivers/block/rsxx/rsxx_priv.h | |||
@@ -39,6 +39,7 @@ | |||
39 | #include <linux/vmalloc.h> | 39 | #include <linux/vmalloc.h> |
40 | #include <linux/timer.h> | 40 | #include <linux/timer.h> |
41 | #include <linux/ioctl.h> | 41 | #include <linux/ioctl.h> |
42 | #include <linux/delay.h> | ||
42 | 43 | ||
43 | #include "rsxx.h" | 44 | #include "rsxx.h" |
44 | #include "rsxx_cfg.h" | 45 | #include "rsxx_cfg.h" |
@@ -114,6 +115,7 @@ struct rsxx_dma_ctrl { | |||
114 | struct timer_list activity_timer; | 115 | struct timer_list activity_timer; |
115 | struct dma_tracker_list *trackers; | 116 | struct dma_tracker_list *trackers; |
116 | struct rsxx_dma_stats stats; | 117 | struct rsxx_dma_stats stats; |
118 | struct mutex work_lock; | ||
117 | }; | 119 | }; |
118 | 120 | ||
119 | struct rsxx_cardinfo { | 121 | struct rsxx_cardinfo { |
@@ -134,6 +136,7 @@ struct rsxx_cardinfo { | |||
134 | spinlock_t lock; | 136 | spinlock_t lock; |
135 | bool active; | 137 | bool active; |
136 | struct creg_cmd *active_cmd; | 138 | struct creg_cmd *active_cmd; |
139 | struct workqueue_struct *creg_wq; | ||
137 | struct work_struct done_work; | 140 | struct work_struct done_work; |
138 | struct list_head queue; | 141 | struct list_head queue; |
139 | unsigned int q_depth; | 142 | unsigned int q_depth; |
@@ -154,6 +157,7 @@ struct rsxx_cardinfo { | |||
154 | int buf_len; | 157 | int buf_len; |
155 | } log; | 158 | } log; |
156 | 159 | ||
160 | struct workqueue_struct *event_wq; | ||
157 | struct work_struct event_work; | 161 | struct work_struct event_work; |
158 | unsigned int state; | 162 | unsigned int state; |
159 | u64 size8; | 163 | u64 size8; |
@@ -181,6 +185,8 @@ struct rsxx_cardinfo { | |||
181 | 185 | ||
182 | int n_targets; | 186 | int n_targets; |
183 | struct rsxx_dma_ctrl *ctrl; | 187 | struct rsxx_dma_ctrl *ctrl; |
188 | |||
189 | struct dentry *debugfs_dir; | ||
184 | }; | 190 | }; |
185 | 191 | ||
186 | enum rsxx_pci_regmap { | 192 | enum rsxx_pci_regmap { |
@@ -283,6 +289,7 @@ enum rsxx_creg_addr { | |||
283 | CREG_ADD_CAPABILITIES = 0x80001050, | 289 | CREG_ADD_CAPABILITIES = 0x80001050, |
284 | CREG_ADD_LOG = 0x80002000, | 290 | CREG_ADD_LOG = 0x80002000, |
285 | CREG_ADD_NUM_TARGETS = 0x80003000, | 291 | CREG_ADD_NUM_TARGETS = 0x80003000, |
292 | CREG_ADD_CRAM = 0xA0000000, | ||
286 | CREG_ADD_CONFIG = 0xB0000000, | 293 | CREG_ADD_CONFIG = 0xB0000000, |
287 | }; | 294 | }; |
288 | 295 | ||
@@ -372,6 +379,8 @@ typedef void (*rsxx_dma_cb)(struct rsxx_cardinfo *card, | |||
372 | int rsxx_dma_setup(struct rsxx_cardinfo *card); | 379 | int rsxx_dma_setup(struct rsxx_cardinfo *card); |
373 | void rsxx_dma_destroy(struct rsxx_cardinfo *card); | 380 | void rsxx_dma_destroy(struct rsxx_cardinfo *card); |
374 | int rsxx_dma_init(void); | 381 | int rsxx_dma_init(void); |
382 | int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, struct list_head *q); | ||
383 | int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl); | ||
375 | void rsxx_dma_cleanup(void); | 384 | void rsxx_dma_cleanup(void); |
376 | void rsxx_dma_queue_reset(struct rsxx_cardinfo *card); | 385 | void rsxx_dma_queue_reset(struct rsxx_cardinfo *card); |
377 | int rsxx_dma_configure(struct rsxx_cardinfo *card); | 386 | int rsxx_dma_configure(struct rsxx_cardinfo *card); |
@@ -382,7 +391,6 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, | |||
382 | void *cb_data); | 391 | void *cb_data); |
383 | int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl); | 392 | int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl); |
384 | int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card); | 393 | int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card); |
385 | void rsxx_eeh_cancel_dmas(struct rsxx_cardinfo *card); | ||
386 | int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card); | 394 | int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card); |
387 | 395 | ||
388 | /***** cregs.c *****/ | 396 | /***** cregs.c *****/ |
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index dd5b2fed97e9..bf4b9d282c04 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c | |||
@@ -50,110 +50,118 @@ | |||
50 | #include "common.h" | 50 | #include "common.h" |
51 | 51 | ||
52 | /* | 52 | /* |
53 | * These are rather arbitrary. They are fairly large because adjacent requests | 53 | * Maximum number of unused free pages to keep in the internal buffer. |
54 | * pulled from a communication ring are quite likely to end up being part of | 54 | * Setting this to a value too low will reduce memory used in each backend, |
55 | * the same scatter/gather request at the disc. | 55 | * but can have a performance penalty. |
56 | * | 56 | * |
57 | * ** TRY INCREASING 'xen_blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW ** | 57 | * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but can |
58 | * | 58 | * be set to a lower value that might degrade performance on some intensive |
59 | * This will increase the chances of being able to write whole tracks. | 59 | * IO workloads. |
60 | * 64 should be enough to keep us competitive with Linux. | ||
61 | */ | 60 | */ |
62 | static int xen_blkif_reqs = 64; | ||
63 | module_param_named(reqs, xen_blkif_reqs, int, 0); | ||
64 | MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate"); | ||
65 | 61 | ||
66 | /* Run-time switchable: /sys/module/blkback/parameters/ */ | 62 | static int xen_blkif_max_buffer_pages = 1024; |
67 | static unsigned int log_stats; | 63 | module_param_named(max_buffer_pages, xen_blkif_max_buffer_pages, int, 0644); |
68 | module_param(log_stats, int, 0644); | 64 | MODULE_PARM_DESC(max_buffer_pages, |
65 | "Maximum number of free pages to keep in each block backend buffer"); | ||
69 | 66 | ||
70 | /* | 67 | /* |
71 | * Each outstanding request that we've passed to the lower device layers has a | 68 | * Maximum number of grants to map persistently in blkback. For maximum |
72 | * 'pending_req' allocated to it. Each buffer_head that completes decrements | 69 | * performance this should be the total number of grants that can be used |
73 | * the pendcnt towards zero. When it hits zero, the specified domain has a | 70 | * to fill the ring, but since this might become too high, especially with |
74 | * response queued for it, with the saved 'id' passed back. | 71 | * the use of indirect descriptors, we set it to a value that provides good |
72 | * performance without using too much memory. | ||
73 | * | ||
74 | * When the list of persistent grants is full we clean it up using a LRU | ||
75 | * algorithm. | ||
75 | */ | 76 | */ |
76 | struct pending_req { | ||
77 | struct xen_blkif *blkif; | ||
78 | u64 id; | ||
79 | int nr_pages; | ||
80 | atomic_t pendcnt; | ||
81 | unsigned short operation; | ||
82 | int status; | ||
83 | struct list_head free_list; | ||
84 | DECLARE_BITMAP(unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST); | ||
85 | }; | ||
86 | 77 | ||
87 | #define BLKBACK_INVALID_HANDLE (~0) | 78 | static int xen_blkif_max_pgrants = 1056; |
79 | module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644); | ||
80 | MODULE_PARM_DESC(max_persistent_grants, | ||
81 | "Maximum number of grants to map persistently"); | ||
88 | 82 | ||
89 | struct xen_blkbk { | 83 | /* |
90 | struct pending_req *pending_reqs; | 84 | * The LRU mechanism to clean the lists of persistent grants needs to |
91 | /* List of all 'pending_req' available */ | 85 | * be executed periodically. The time interval between consecutive executions |
92 | struct list_head pending_free; | 86 | * of the purge mechanism is set in ms. |
93 | /* And its spinlock. */ | 87 | */ |
94 | spinlock_t pending_free_lock; | 88 | #define LRU_INTERVAL 100 |
95 | wait_queue_head_t pending_free_wq; | ||
96 | /* The list of all pages that are available. */ | ||
97 | struct page **pending_pages; | ||
98 | /* And the grant handles that are available. */ | ||
99 | grant_handle_t *pending_grant_handles; | ||
100 | }; | ||
101 | |||
102 | static struct xen_blkbk *blkbk; | ||
103 | 89 | ||
104 | /* | 90 | /* |
105 | * Maximum number of grant pages that can be mapped in blkback. | 91 | * When the persistent grants list is full we will remove unused grants |
106 | * BLKIF_MAX_SEGMENTS_PER_REQUEST * RING_SIZE is the maximum number of | 92 | * from the list. The percentage of grants to be removed at each LRU |
107 | * pages that blkback will persistently map. | 93 | * execution. |
108 | * Currently, this is: | ||
109 | * RING_SIZE = 32 (for all known ring types) | ||
110 | * BLKIF_MAX_SEGMENTS_PER_REQUEST = 11 | ||
111 | * sizeof(struct persistent_gnt) = 48 | ||
112 | * So the maximum memory used to store the grants is: | ||
113 | * 32 * 11 * 48 = 16896 bytes | ||
114 | */ | 94 | */ |
115 | static inline unsigned int max_mapped_grant_pages(enum blkif_protocol protocol) | 95 | #define LRU_PERCENT_CLEAN 5 |
96 | |||
97 | /* Run-time switchable: /sys/module/blkback/parameters/ */ | ||
98 | static unsigned int log_stats; | ||
99 | module_param(log_stats, int, 0644); | ||
100 | |||
101 | #define BLKBACK_INVALID_HANDLE (~0) | ||
102 | |||
103 | /* Number of free pages to remove on each call to free_xenballooned_pages */ | ||
104 | #define NUM_BATCH_FREE_PAGES 10 | ||
105 | |||
106 | static inline int get_free_page(struct xen_blkif *blkif, struct page **page) | ||
116 | { | 107 | { |
117 | switch (protocol) { | 108 | unsigned long flags; |
118 | case BLKIF_PROTOCOL_NATIVE: | 109 | |
119 | return __CONST_RING_SIZE(blkif, PAGE_SIZE) * | 110 | spin_lock_irqsave(&blkif->free_pages_lock, flags); |
120 | BLKIF_MAX_SEGMENTS_PER_REQUEST; | 111 | if (list_empty(&blkif->free_pages)) { |
121 | case BLKIF_PROTOCOL_X86_32: | 112 | BUG_ON(blkif->free_pages_num != 0); |
122 | return __CONST_RING_SIZE(blkif_x86_32, PAGE_SIZE) * | 113 | spin_unlock_irqrestore(&blkif->free_pages_lock, flags); |
123 | BLKIF_MAX_SEGMENTS_PER_REQUEST; | 114 | return alloc_xenballooned_pages(1, page, false); |
124 | case BLKIF_PROTOCOL_X86_64: | ||
125 | return __CONST_RING_SIZE(blkif_x86_64, PAGE_SIZE) * | ||
126 | BLKIF_MAX_SEGMENTS_PER_REQUEST; | ||
127 | default: | ||
128 | BUG(); | ||
129 | } | 115 | } |
116 | BUG_ON(blkif->free_pages_num == 0); | ||
117 | page[0] = list_first_entry(&blkif->free_pages, struct page, lru); | ||
118 | list_del(&page[0]->lru); | ||
119 | blkif->free_pages_num--; | ||
120 | spin_unlock_irqrestore(&blkif->free_pages_lock, flags); | ||
121 | |||
130 | return 0; | 122 | return 0; |
131 | } | 123 | } |
132 | 124 | ||
133 | 125 | static inline void put_free_pages(struct xen_blkif *blkif, struct page **page, | |
134 | /* | 126 | int num) |
135 | * Little helpful macro to figure out the index and virtual address of the | ||
136 | * pending_pages[..]. For each 'pending_req' we have have up to | ||
137 | * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through | ||
138 | * 10 and would index in the pending_pages[..]. | ||
139 | */ | ||
140 | static inline int vaddr_pagenr(struct pending_req *req, int seg) | ||
141 | { | 127 | { |
142 | return (req - blkbk->pending_reqs) * | 128 | unsigned long flags; |
143 | BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; | 129 | int i; |
144 | } | ||
145 | 130 | ||
146 | #define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)] | 131 | spin_lock_irqsave(&blkif->free_pages_lock, flags); |
132 | for (i = 0; i < num; i++) | ||
133 | list_add(&page[i]->lru, &blkif->free_pages); | ||
134 | blkif->free_pages_num += num; | ||
135 | spin_unlock_irqrestore(&blkif->free_pages_lock, flags); | ||
136 | } | ||
147 | 137 | ||
148 | static inline unsigned long vaddr(struct pending_req *req, int seg) | 138 | static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num) |
149 | { | 139 | { |
150 | unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg)); | 140 | /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */ |
151 | return (unsigned long)pfn_to_kaddr(pfn); | 141 | struct page *page[NUM_BATCH_FREE_PAGES]; |
152 | } | 142 | unsigned int num_pages = 0; |
143 | unsigned long flags; | ||
153 | 144 | ||
154 | #define pending_handle(_req, _seg) \ | 145 | spin_lock_irqsave(&blkif->free_pages_lock, flags); |
155 | (blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)]) | 146 | while (blkif->free_pages_num > num) { |
147 | BUG_ON(list_empty(&blkif->free_pages)); | ||
148 | page[num_pages] = list_first_entry(&blkif->free_pages, | ||
149 | struct page, lru); | ||
150 | list_del(&page[num_pages]->lru); | ||
151 | blkif->free_pages_num--; | ||
152 | if (++num_pages == NUM_BATCH_FREE_PAGES) { | ||
153 | spin_unlock_irqrestore(&blkif->free_pages_lock, flags); | ||
154 | free_xenballooned_pages(num_pages, page); | ||
155 | spin_lock_irqsave(&blkif->free_pages_lock, flags); | ||
156 | num_pages = 0; | ||
157 | } | ||
158 | } | ||
159 | spin_unlock_irqrestore(&blkif->free_pages_lock, flags); | ||
160 | if (num_pages != 0) | ||
161 | free_xenballooned_pages(num_pages, page); | ||
162 | } | ||
156 | 163 | ||
164 | #define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page))) | ||
157 | 165 | ||
158 | static int do_block_io_op(struct xen_blkif *blkif); | 166 | static int do_block_io_op(struct xen_blkif *blkif); |
159 | static int dispatch_rw_block_io(struct xen_blkif *blkif, | 167 | static int dispatch_rw_block_io(struct xen_blkif *blkif, |
@@ -170,13 +178,29 @@ static void make_response(struct xen_blkif *blkif, u64 id, | |||
170 | (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL) | 178 | (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL) |
171 | 179 | ||
172 | 180 | ||
173 | static void add_persistent_gnt(struct rb_root *root, | 181 | /* |
182 | * We don't need locking around the persistent grant helpers | ||
183 | * because blkback uses a single thread for each backend, so we | ||
184 | * can be sure that these functions will never be called recursively. | ||
185 | * | ||
186 | * The only exception to that is put_persistent_grant, that can be called | ||
187 | * from interrupt context (by xen_blkbk_unmap), so we have to use atomic | ||
188 | * bit operations to modify the flags of a persistent grant and to count | ||
189 | * the number of used grants. | ||
190 | */ | ||
191 | static int add_persistent_gnt(struct xen_blkif *blkif, | ||
174 | struct persistent_gnt *persistent_gnt) | 192 | struct persistent_gnt *persistent_gnt) |
175 | { | 193 | { |
176 | struct rb_node **new = &(root->rb_node), *parent = NULL; | 194 | struct rb_node **new = NULL, *parent = NULL; |
177 | struct persistent_gnt *this; | 195 | struct persistent_gnt *this; |
178 | 196 | ||
197 | if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) { | ||
198 | if (!blkif->vbd.overflow_max_grants) | ||
199 | blkif->vbd.overflow_max_grants = 1; | ||
200 | return -EBUSY; | ||
201 | } | ||
179 | /* Figure out where to put new node */ | 202 | /* Figure out where to put new node */ |
203 | new = &blkif->persistent_gnts.rb_node; | ||
180 | while (*new) { | 204 | while (*new) { |
181 | this = container_of(*new, struct persistent_gnt, node); | 205 | this = container_of(*new, struct persistent_gnt, node); |
182 | 206 | ||
@@ -186,22 +210,28 @@ static void add_persistent_gnt(struct rb_root *root, | |||
186 | else if (persistent_gnt->gnt > this->gnt) | 210 | else if (persistent_gnt->gnt > this->gnt) |
187 | new = &((*new)->rb_right); | 211 | new = &((*new)->rb_right); |
188 | else { | 212 | else { |
189 | pr_alert(DRV_PFX " trying to add a gref that's already in the tree\n"); | 213 | pr_alert_ratelimited(DRV_PFX " trying to add a gref that's already in the tree\n"); |
190 | BUG(); | 214 | return -EINVAL; |
191 | } | 215 | } |
192 | } | 216 | } |
193 | 217 | ||
218 | bitmap_zero(persistent_gnt->flags, PERSISTENT_GNT_FLAGS_SIZE); | ||
219 | set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); | ||
194 | /* Add new node and rebalance tree. */ | 220 | /* Add new node and rebalance tree. */ |
195 | rb_link_node(&(persistent_gnt->node), parent, new); | 221 | rb_link_node(&(persistent_gnt->node), parent, new); |
196 | rb_insert_color(&(persistent_gnt->node), root); | 222 | rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts); |
223 | blkif->persistent_gnt_c++; | ||
224 | atomic_inc(&blkif->persistent_gnt_in_use); | ||
225 | return 0; | ||
197 | } | 226 | } |
198 | 227 | ||
199 | static struct persistent_gnt *get_persistent_gnt(struct rb_root *root, | 228 | static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, |
200 | grant_ref_t gref) | 229 | grant_ref_t gref) |
201 | { | 230 | { |
202 | struct persistent_gnt *data; | 231 | struct persistent_gnt *data; |
203 | struct rb_node *node = root->rb_node; | 232 | struct rb_node *node = NULL; |
204 | 233 | ||
234 | node = blkif->persistent_gnts.rb_node; | ||
205 | while (node) { | 235 | while (node) { |
206 | data = container_of(node, struct persistent_gnt, node); | 236 | data = container_of(node, struct persistent_gnt, node); |
207 | 237 | ||
@@ -209,13 +239,31 @@ static struct persistent_gnt *get_persistent_gnt(struct rb_root *root, | |||
209 | node = node->rb_left; | 239 | node = node->rb_left; |
210 | else if (gref > data->gnt) | 240 | else if (gref > data->gnt) |
211 | node = node->rb_right; | 241 | node = node->rb_right; |
212 | else | 242 | else { |
243 | if(test_bit(PERSISTENT_GNT_ACTIVE, data->flags)) { | ||
244 | pr_alert_ratelimited(DRV_PFX " requesting a grant already in use\n"); | ||
245 | return NULL; | ||
246 | } | ||
247 | set_bit(PERSISTENT_GNT_ACTIVE, data->flags); | ||
248 | atomic_inc(&blkif->persistent_gnt_in_use); | ||
213 | return data; | 249 | return data; |
250 | } | ||
214 | } | 251 | } |
215 | return NULL; | 252 | return NULL; |
216 | } | 253 | } |
217 | 254 | ||
218 | static void free_persistent_gnts(struct rb_root *root, unsigned int num) | 255 | static void put_persistent_gnt(struct xen_blkif *blkif, |
256 | struct persistent_gnt *persistent_gnt) | ||
257 | { | ||
258 | if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) | ||
259 | pr_alert_ratelimited(DRV_PFX " freeing a grant already unused"); | ||
260 | set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); | ||
261 | clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); | ||
262 | atomic_dec(&blkif->persistent_gnt_in_use); | ||
263 | } | ||
264 | |||
265 | static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, | ||
266 | unsigned int num) | ||
219 | { | 267 | { |
220 | struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 268 | struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
221 | struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 269 | struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
@@ -240,7 +288,7 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num) | |||
240 | ret = gnttab_unmap_refs(unmap, NULL, pages, | 288 | ret = gnttab_unmap_refs(unmap, NULL, pages, |
241 | segs_to_unmap); | 289 | segs_to_unmap); |
242 | BUG_ON(ret); | 290 | BUG_ON(ret); |
243 | free_xenballooned_pages(segs_to_unmap, pages); | 291 | put_free_pages(blkif, pages, segs_to_unmap); |
244 | segs_to_unmap = 0; | 292 | segs_to_unmap = 0; |
245 | } | 293 | } |
246 | 294 | ||
@@ -251,21 +299,148 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num) | |||
251 | BUG_ON(num != 0); | 299 | BUG_ON(num != 0); |
252 | } | 300 | } |
253 | 301 | ||
302 | static void unmap_purged_grants(struct work_struct *work) | ||
303 | { | ||
304 | struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
305 | struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
306 | struct persistent_gnt *persistent_gnt; | ||
307 | int ret, segs_to_unmap = 0; | ||
308 | struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work); | ||
309 | |||
310 | while(!list_empty(&blkif->persistent_purge_list)) { | ||
311 | persistent_gnt = list_first_entry(&blkif->persistent_purge_list, | ||
312 | struct persistent_gnt, | ||
313 | remove_node); | ||
314 | list_del(&persistent_gnt->remove_node); | ||
315 | |||
316 | gnttab_set_unmap_op(&unmap[segs_to_unmap], | ||
317 | vaddr(persistent_gnt->page), | ||
318 | GNTMAP_host_map, | ||
319 | persistent_gnt->handle); | ||
320 | |||
321 | pages[segs_to_unmap] = persistent_gnt->page; | ||
322 | |||
323 | if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) { | ||
324 | ret = gnttab_unmap_refs(unmap, NULL, pages, | ||
325 | segs_to_unmap); | ||
326 | BUG_ON(ret); | ||
327 | put_free_pages(blkif, pages, segs_to_unmap); | ||
328 | segs_to_unmap = 0; | ||
329 | } | ||
330 | kfree(persistent_gnt); | ||
331 | } | ||
332 | if (segs_to_unmap > 0) { | ||
333 | ret = gnttab_unmap_refs(unmap, NULL, pages, segs_to_unmap); | ||
334 | BUG_ON(ret); | ||
335 | put_free_pages(blkif, pages, segs_to_unmap); | ||
336 | } | ||
337 | } | ||
338 | |||
339 | static void purge_persistent_gnt(struct xen_blkif *blkif) | ||
340 | { | ||
341 | struct persistent_gnt *persistent_gnt; | ||
342 | struct rb_node *n; | ||
343 | unsigned int num_clean, total; | ||
344 | bool scan_used = false, clean_used = false; | ||
345 | struct rb_root *root; | ||
346 | |||
347 | if (blkif->persistent_gnt_c < xen_blkif_max_pgrants || | ||
348 | (blkif->persistent_gnt_c == xen_blkif_max_pgrants && | ||
349 | !blkif->vbd.overflow_max_grants)) { | ||
350 | return; | ||
351 | } | ||
352 | |||
353 | if (work_pending(&blkif->persistent_purge_work)) { | ||
354 | pr_alert_ratelimited(DRV_PFX "Scheduled work from previous purge is still pending, cannot purge list\n"); | ||
355 | return; | ||
356 | } | ||
357 | |||
358 | num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN; | ||
359 | num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean; | ||
360 | num_clean = min(blkif->persistent_gnt_c, num_clean); | ||
361 | if ((num_clean == 0) || | ||
362 | (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use)))) | ||
363 | return; | ||
364 | |||
365 | /* | ||
366 | * At this point, we can be sure that there will be no calls | ||
367 | * to get_persistent_gnt (because we are executing this code from | ||
368 | * xen_blkif_schedule); there can only be calls to put_persistent_gnt, | ||
369 | * which means that the number of currently used grants will go down, | ||
370 | * but never up, so we will always be able to remove the requested | ||
371 | * number of grants. | ||
372 | */ | ||
373 | |||
374 | total = num_clean; | ||
375 | |||
376 | pr_debug(DRV_PFX "Going to purge %u persistent grants\n", num_clean); | ||
377 | |||
378 | INIT_LIST_HEAD(&blkif->persistent_purge_list); | ||
379 | root = &blkif->persistent_gnts; | ||
380 | purge_list: | ||
381 | foreach_grant_safe(persistent_gnt, n, root, node) { | ||
382 | BUG_ON(persistent_gnt->handle == | ||
383 | BLKBACK_INVALID_HANDLE); | ||
384 | |||
385 | if (clean_used) { | ||
386 | clear_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); | ||
387 | continue; | ||
388 | } | ||
389 | |||
390 | if (test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) | ||
391 | continue; | ||
392 | if (!scan_used && | ||
393 | (test_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags))) | ||
394 | continue; | ||
395 | |||
396 | rb_erase(&persistent_gnt->node, root); | ||
397 | list_add(&persistent_gnt->remove_node, | ||
398 | &blkif->persistent_purge_list); | ||
399 | if (--num_clean == 0) | ||
400 | goto finished; | ||
401 | } | ||
402 | /* | ||
403 | * If we get here it means we also need to start cleaning | ||
404 | * grants that were used since the last purge in order to cope | ||
405 | * with the requested number. | ||
406 | */ | ||
407 | if (!scan_used && !clean_used) { | ||
408 | pr_debug(DRV_PFX "Still missing %u purged frames\n", num_clean); | ||
409 | scan_used = true; | ||
410 | goto purge_list; | ||
411 | } | ||
412 | finished: | ||
413 | if (!clean_used) { | ||
414 | pr_debug(DRV_PFX "Finished scanning for grants to clean, removing used flag\n"); | ||
415 | clean_used = true; | ||
416 | goto purge_list; | ||
417 | } | ||
418 | |||
419 | blkif->persistent_gnt_c -= (total - num_clean); | ||
420 | blkif->vbd.overflow_max_grants = 0; | ||
421 | |||
422 | /* We can defer this work */ | ||
423 | INIT_WORK(&blkif->persistent_purge_work, unmap_purged_grants); | ||
424 | schedule_work(&blkif->persistent_purge_work); | ||
425 | pr_debug(DRV_PFX "Purged %u/%u\n", (total - num_clean), total); | ||
426 | return; | ||
427 | } | ||
428 | |||
254 | /* | 429 | /* |
255 | * Retrieve from the 'pending_reqs' a free pending_req structure to be used. | 430 | * Retrieve from the 'pending_reqs' a free pending_req structure to be used. |
256 | */ | 431 | */ |
257 | static struct pending_req *alloc_req(void) | 432 | static struct pending_req *alloc_req(struct xen_blkif *blkif) |
258 | { | 433 | { |
259 | struct pending_req *req = NULL; | 434 | struct pending_req *req = NULL; |
260 | unsigned long flags; | 435 | unsigned long flags; |
261 | 436 | ||
262 | spin_lock_irqsave(&blkbk->pending_free_lock, flags); | 437 | spin_lock_irqsave(&blkif->pending_free_lock, flags); |
263 | if (!list_empty(&blkbk->pending_free)) { | 438 | if (!list_empty(&blkif->pending_free)) { |
264 | req = list_entry(blkbk->pending_free.next, struct pending_req, | 439 | req = list_entry(blkif->pending_free.next, struct pending_req, |
265 | free_list); | 440 | free_list); |
266 | list_del(&req->free_list); | 441 | list_del(&req->free_list); |
267 | } | 442 | } |
268 | spin_unlock_irqrestore(&blkbk->pending_free_lock, flags); | 443 | spin_unlock_irqrestore(&blkif->pending_free_lock, flags); |
269 | return req; | 444 | return req; |
270 | } | 445 | } |
271 | 446 | ||
@@ -273,17 +448,17 @@ static struct pending_req *alloc_req(void) | |||
273 | * Return the 'pending_req' structure back to the freepool. We also | 448 | * Return the 'pending_req' structure back to the freepool. We also |
274 | * wake up the thread if it was waiting for a free page. | 449 | * wake up the thread if it was waiting for a free page. |
275 | */ | 450 | */ |
276 | static void free_req(struct pending_req *req) | 451 | static void free_req(struct xen_blkif *blkif, struct pending_req *req) |
277 | { | 452 | { |
278 | unsigned long flags; | 453 | unsigned long flags; |
279 | int was_empty; | 454 | int was_empty; |
280 | 455 | ||
281 | spin_lock_irqsave(&blkbk->pending_free_lock, flags); | 456 | spin_lock_irqsave(&blkif->pending_free_lock, flags); |
282 | was_empty = list_empty(&blkbk->pending_free); | 457 | was_empty = list_empty(&blkif->pending_free); |
283 | list_add(&req->free_list, &blkbk->pending_free); | 458 | list_add(&req->free_list, &blkif->pending_free); |
284 | spin_unlock_irqrestore(&blkbk->pending_free_lock, flags); | 459 | spin_unlock_irqrestore(&blkif->pending_free_lock, flags); |
285 | if (was_empty) | 460 | if (was_empty) |
286 | wake_up(&blkbk->pending_free_wq); | 461 | wake_up(&blkif->pending_free_wq); |
287 | } | 462 | } |
288 | 463 | ||
289 | /* | 464 | /* |
@@ -382,10 +557,12 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id) | |||
382 | static void print_stats(struct xen_blkif *blkif) | 557 | static void print_stats(struct xen_blkif *blkif) |
383 | { | 558 | { |
384 | pr_info("xen-blkback (%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" | 559 | pr_info("xen-blkback (%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" |
385 | " | ds %4llu\n", | 560 | " | ds %4llu | pg: %4u/%4d\n", |
386 | current->comm, blkif->st_oo_req, | 561 | current->comm, blkif->st_oo_req, |
387 | blkif->st_rd_req, blkif->st_wr_req, | 562 | blkif->st_rd_req, blkif->st_wr_req, |
388 | blkif->st_f_req, blkif->st_ds_req); | 563 | blkif->st_f_req, blkif->st_ds_req, |
564 | blkif->persistent_gnt_c, | ||
565 | xen_blkif_max_pgrants); | ||
389 | blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); | 566 | blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); |
390 | blkif->st_rd_req = 0; | 567 | blkif->st_rd_req = 0; |
391 | blkif->st_wr_req = 0; | 568 | blkif->st_wr_req = 0; |
@@ -397,6 +574,8 @@ int xen_blkif_schedule(void *arg) | |||
397 | { | 574 | { |
398 | struct xen_blkif *blkif = arg; | 575 | struct xen_blkif *blkif = arg; |
399 | struct xen_vbd *vbd = &blkif->vbd; | 576 | struct xen_vbd *vbd = &blkif->vbd; |
577 | unsigned long timeout; | ||
578 | int ret; | ||
400 | 579 | ||
401 | xen_blkif_get(blkif); | 580 | xen_blkif_get(blkif); |
402 | 581 | ||
@@ -406,27 +585,52 @@ int xen_blkif_schedule(void *arg) | |||
406 | if (unlikely(vbd->size != vbd_sz(vbd))) | 585 | if (unlikely(vbd->size != vbd_sz(vbd))) |
407 | xen_vbd_resize(blkif); | 586 | xen_vbd_resize(blkif); |
408 | 587 | ||
409 | wait_event_interruptible( | 588 | timeout = msecs_to_jiffies(LRU_INTERVAL); |
589 | |||
590 | timeout = wait_event_interruptible_timeout( | ||
410 | blkif->wq, | 591 | blkif->wq, |
411 | blkif->waiting_reqs || kthread_should_stop()); | 592 | blkif->waiting_reqs || kthread_should_stop(), |
412 | wait_event_interruptible( | 593 | timeout); |
413 | blkbk->pending_free_wq, | 594 | if (timeout == 0) |
414 | !list_empty(&blkbk->pending_free) || | 595 | goto purge_gnt_list; |
415 | kthread_should_stop()); | 596 | timeout = wait_event_interruptible_timeout( |
597 | blkif->pending_free_wq, | ||
598 | !list_empty(&blkif->pending_free) || | ||
599 | kthread_should_stop(), | ||
600 | timeout); | ||
601 | if (timeout == 0) | ||
602 | goto purge_gnt_list; | ||
416 | 603 | ||
417 | blkif->waiting_reqs = 0; | 604 | blkif->waiting_reqs = 0; |
418 | smp_mb(); /* clear flag *before* checking for work */ | 605 | smp_mb(); /* clear flag *before* checking for work */ |
419 | 606 | ||
420 | if (do_block_io_op(blkif)) | 607 | ret = do_block_io_op(blkif); |
608 | if (ret > 0) | ||
421 | blkif->waiting_reqs = 1; | 609 | blkif->waiting_reqs = 1; |
610 | if (ret == -EACCES) | ||
611 | wait_event_interruptible(blkif->shutdown_wq, | ||
612 | kthread_should_stop()); | ||
613 | |||
614 | purge_gnt_list: | ||
615 | if (blkif->vbd.feature_gnt_persistent && | ||
616 | time_after(jiffies, blkif->next_lru)) { | ||
617 | purge_persistent_gnt(blkif); | ||
618 | blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); | ||
619 | } | ||
620 | |||
621 | /* Shrink if we have more than xen_blkif_max_buffer_pages */ | ||
622 | shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages); | ||
422 | 623 | ||
423 | if (log_stats && time_after(jiffies, blkif->st_print)) | 624 | if (log_stats && time_after(jiffies, blkif->st_print)) |
424 | print_stats(blkif); | 625 | print_stats(blkif); |
425 | } | 626 | } |
426 | 627 | ||
628 | /* Since we are shutting down remove all pages from the buffer */ | ||
629 | shrink_free_pagepool(blkif, 0 /* All */); | ||
630 | |||
427 | /* Free all persistent grant pages */ | 631 | /* Free all persistent grant pages */ |
428 | if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) | 632 | if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) |
429 | free_persistent_gnts(&blkif->persistent_gnts, | 633 | free_persistent_gnts(blkif, &blkif->persistent_gnts, |
430 | blkif->persistent_gnt_c); | 634 | blkif->persistent_gnt_c); |
431 | 635 | ||
432 | BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); | 636 | BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); |
@@ -441,148 +645,98 @@ int xen_blkif_schedule(void *arg) | |||
441 | return 0; | 645 | return 0; |
442 | } | 646 | } |
443 | 647 | ||
444 | struct seg_buf { | ||
445 | unsigned int offset; | ||
446 | unsigned int nsec; | ||
447 | }; | ||
448 | /* | 648 | /* |
449 | * Unmap the grant references, and also remove the M2P over-rides | 649 | * Unmap the grant references, and also remove the M2P over-rides |
450 | * used in the 'pending_req'. | 650 | * used in the 'pending_req'. |
451 | */ | 651 | */ |
452 | static void xen_blkbk_unmap(struct pending_req *req) | 652 | static void xen_blkbk_unmap(struct xen_blkif *blkif, |
653 | struct grant_page *pages[], | ||
654 | int num) | ||
453 | { | 655 | { |
454 | struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 656 | struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
455 | struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 657 | struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
456 | unsigned int i, invcount = 0; | 658 | unsigned int i, invcount = 0; |
457 | grant_handle_t handle; | ||
458 | int ret; | 659 | int ret; |
459 | 660 | ||
460 | for (i = 0; i < req->nr_pages; i++) { | 661 | for (i = 0; i < num; i++) { |
461 | if (!test_bit(i, req->unmap_seg)) | 662 | if (pages[i]->persistent_gnt != NULL) { |
663 | put_persistent_gnt(blkif, pages[i]->persistent_gnt); | ||
462 | continue; | 664 | continue; |
463 | handle = pending_handle(req, i); | 665 | } |
464 | if (handle == BLKBACK_INVALID_HANDLE) | 666 | if (pages[i]->handle == BLKBACK_INVALID_HANDLE) |
465 | continue; | 667 | continue; |
466 | gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i), | 668 | unmap_pages[invcount] = pages[i]->page; |
467 | GNTMAP_host_map, handle); | 669 | gnttab_set_unmap_op(&unmap[invcount], vaddr(pages[i]->page), |
468 | pending_handle(req, i) = BLKBACK_INVALID_HANDLE; | 670 | GNTMAP_host_map, pages[i]->handle); |
469 | pages[invcount] = virt_to_page(vaddr(req, i)); | 671 | pages[i]->handle = BLKBACK_INVALID_HANDLE; |
470 | invcount++; | 672 | if (++invcount == BLKIF_MAX_SEGMENTS_PER_REQUEST) { |
673 | ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, | ||
674 | invcount); | ||
675 | BUG_ON(ret); | ||
676 | put_free_pages(blkif, unmap_pages, invcount); | ||
677 | invcount = 0; | ||
678 | } | ||
679 | } | ||
680 | if (invcount) { | ||
681 | ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount); | ||
682 | BUG_ON(ret); | ||
683 | put_free_pages(blkif, unmap_pages, invcount); | ||
471 | } | 684 | } |
472 | |||
473 | ret = gnttab_unmap_refs(unmap, NULL, pages, invcount); | ||
474 | BUG_ON(ret); | ||
475 | } | 685 | } |
476 | 686 | ||
477 | static int xen_blkbk_map(struct blkif_request *req, | 687 | static int xen_blkbk_map(struct xen_blkif *blkif, |
478 | struct pending_req *pending_req, | 688 | struct grant_page *pages[], |
479 | struct seg_buf seg[], | 689 | int num, bool ro) |
480 | struct page *pages[]) | ||
481 | { | 690 | { |
482 | struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 691 | struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
483 | struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
484 | struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 692 | struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
485 | struct persistent_gnt *persistent_gnt = NULL; | 693 | struct persistent_gnt *persistent_gnt = NULL; |
486 | struct xen_blkif *blkif = pending_req->blkif; | ||
487 | phys_addr_t addr = 0; | 694 | phys_addr_t addr = 0; |
488 | int i, j; | 695 | int i, seg_idx, new_map_idx; |
489 | bool new_map; | ||
490 | int nseg = req->u.rw.nr_segments; | ||
491 | int segs_to_map = 0; | 696 | int segs_to_map = 0; |
492 | int ret = 0; | 697 | int ret = 0; |
698 | int last_map = 0, map_until = 0; | ||
493 | int use_persistent_gnts; | 699 | int use_persistent_gnts; |
494 | 700 | ||
495 | use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); | 701 | use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); |
496 | 702 | ||
497 | BUG_ON(blkif->persistent_gnt_c > | ||
498 | max_mapped_grant_pages(pending_req->blkif->blk_protocol)); | ||
499 | |||
500 | /* | 703 | /* |
501 | * Fill out preq.nr_sects with proper amount of sectors, and setup | 704 | * Fill out preq.nr_sects with proper amount of sectors, and setup |
502 | * assign map[..] with the PFN of the page in our domain with the | 705 | * assign map[..] with the PFN of the page in our domain with the |
503 | * corresponding grant reference for each page. | 706 | * corresponding grant reference for each page. |
504 | */ | 707 | */ |
505 | for (i = 0; i < nseg; i++) { | 708 | again: |
709 | for (i = map_until; i < num; i++) { | ||
506 | uint32_t flags; | 710 | uint32_t flags; |
507 | 711 | ||
508 | if (use_persistent_gnts) | 712 | if (use_persistent_gnts) |
509 | persistent_gnt = get_persistent_gnt( | 713 | persistent_gnt = get_persistent_gnt( |
510 | &blkif->persistent_gnts, | 714 | blkif, |
511 | req->u.rw.seg[i].gref); | 715 | pages[i]->gref); |
512 | 716 | ||
513 | if (persistent_gnt) { | 717 | if (persistent_gnt) { |
514 | /* | 718 | /* |
515 | * We are using persistent grants and | 719 | * We are using persistent grants and |
516 | * the grant is already mapped | 720 | * the grant is already mapped |
517 | */ | 721 | */ |
518 | new_map = false; | 722 | pages[i]->page = persistent_gnt->page; |
519 | } else if (use_persistent_gnts && | 723 | pages[i]->persistent_gnt = persistent_gnt; |
520 | blkif->persistent_gnt_c < | ||
521 | max_mapped_grant_pages(blkif->blk_protocol)) { | ||
522 | /* | ||
523 | * We are using persistent grants, the grant is | ||
524 | * not mapped but we have room for it | ||
525 | */ | ||
526 | new_map = true; | ||
527 | persistent_gnt = kmalloc( | ||
528 | sizeof(struct persistent_gnt), | ||
529 | GFP_KERNEL); | ||
530 | if (!persistent_gnt) | ||
531 | return -ENOMEM; | ||
532 | if (alloc_xenballooned_pages(1, &persistent_gnt->page, | ||
533 | false)) { | ||
534 | kfree(persistent_gnt); | ||
535 | return -ENOMEM; | ||
536 | } | ||
537 | persistent_gnt->gnt = req->u.rw.seg[i].gref; | ||
538 | persistent_gnt->handle = BLKBACK_INVALID_HANDLE; | ||
539 | |||
540 | pages_to_gnt[segs_to_map] = | ||
541 | persistent_gnt->page; | ||
542 | addr = (unsigned long) pfn_to_kaddr( | ||
543 | page_to_pfn(persistent_gnt->page)); | ||
544 | |||
545 | add_persistent_gnt(&blkif->persistent_gnts, | ||
546 | persistent_gnt); | ||
547 | blkif->persistent_gnt_c++; | ||
548 | pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n", | ||
549 | persistent_gnt->gnt, blkif->persistent_gnt_c, | ||
550 | max_mapped_grant_pages(blkif->blk_protocol)); | ||
551 | } else { | 724 | } else { |
552 | /* | 725 | if (get_free_page(blkif, &pages[i]->page)) |
553 | * We are either using persistent grants and | 726 | goto out_of_memory; |
554 | * hit the maximum limit of grants mapped, | 727 | addr = vaddr(pages[i]->page); |
555 | * or we are not using persistent grants. | 728 | pages_to_gnt[segs_to_map] = pages[i]->page; |
556 | */ | 729 | pages[i]->persistent_gnt = NULL; |
557 | if (use_persistent_gnts && | ||
558 | !blkif->vbd.overflow_max_grants) { | ||
559 | blkif->vbd.overflow_max_grants = 1; | ||
560 | pr_alert(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n", | ||
561 | blkif->domid, blkif->vbd.handle); | ||
562 | } | ||
563 | new_map = true; | ||
564 | pages[i] = blkbk->pending_page(pending_req, i); | ||
565 | addr = vaddr(pending_req, i); | ||
566 | pages_to_gnt[segs_to_map] = | ||
567 | blkbk->pending_page(pending_req, i); | ||
568 | } | ||
569 | |||
570 | if (persistent_gnt) { | ||
571 | pages[i] = persistent_gnt->page; | ||
572 | persistent_gnts[i] = persistent_gnt; | ||
573 | } else { | ||
574 | persistent_gnts[i] = NULL; | ||
575 | } | ||
576 | |||
577 | if (new_map) { | ||
578 | flags = GNTMAP_host_map; | 730 | flags = GNTMAP_host_map; |
579 | if (!persistent_gnt && | 731 | if (!use_persistent_gnts && ro) |
580 | (pending_req->operation != BLKIF_OP_READ)) | ||
581 | flags |= GNTMAP_readonly; | 732 | flags |= GNTMAP_readonly; |
582 | gnttab_set_map_op(&map[segs_to_map++], addr, | 733 | gnttab_set_map_op(&map[segs_to_map++], addr, |
583 | flags, req->u.rw.seg[i].gref, | 734 | flags, pages[i]->gref, |
584 | blkif->domid); | 735 | blkif->domid); |
585 | } | 736 | } |
737 | map_until = i + 1; | ||
738 | if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST) | ||
739 | break; | ||
586 | } | 740 | } |
587 | 741 | ||
588 | if (segs_to_map) { | 742 | if (segs_to_map) { |
@@ -595,49 +749,133 @@ static int xen_blkbk_map(struct blkif_request *req, | |||
595 | * so that when we access vaddr(pending_req,i) it has the contents of | 749 | * so that when we access vaddr(pending_req,i) it has the contents of |
596 | * the page from the other domain. | 750 | * the page from the other domain. |
597 | */ | 751 | */ |
598 | bitmap_zero(pending_req->unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST); | 752 | for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) { |
599 | for (i = 0, j = 0; i < nseg; i++) { | 753 | if (!pages[seg_idx]->persistent_gnt) { |
600 | if (!persistent_gnts[i] || | ||
601 | persistent_gnts[i]->handle == BLKBACK_INVALID_HANDLE) { | ||
602 | /* This is a newly mapped grant */ | 754 | /* This is a newly mapped grant */ |
603 | BUG_ON(j >= segs_to_map); | 755 | BUG_ON(new_map_idx >= segs_to_map); |
604 | if (unlikely(map[j].status != 0)) { | 756 | if (unlikely(map[new_map_idx].status != 0)) { |
605 | pr_debug(DRV_PFX "invalid buffer -- could not remap it\n"); | 757 | pr_debug(DRV_PFX "invalid buffer -- could not remap it\n"); |
606 | map[j].handle = BLKBACK_INVALID_HANDLE; | 758 | pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE; |
607 | ret |= 1; | 759 | ret |= 1; |
608 | if (persistent_gnts[i]) { | 760 | goto next; |
609 | rb_erase(&persistent_gnts[i]->node, | ||
610 | &blkif->persistent_gnts); | ||
611 | blkif->persistent_gnt_c--; | ||
612 | kfree(persistent_gnts[i]); | ||
613 | persistent_gnts[i] = NULL; | ||
614 | } | ||
615 | } | 761 | } |
762 | pages[seg_idx]->handle = map[new_map_idx].handle; | ||
763 | } else { | ||
764 | continue; | ||
616 | } | 765 | } |
617 | if (persistent_gnts[i]) { | 766 | if (use_persistent_gnts && |
618 | if (persistent_gnts[i]->handle == | 767 | blkif->persistent_gnt_c < xen_blkif_max_pgrants) { |
619 | BLKBACK_INVALID_HANDLE) { | 768 | /* |
769 | * We are using persistent grants, the grant is | ||
770 | * not mapped but we might have room for it. | ||
771 | */ | ||
772 | persistent_gnt = kmalloc(sizeof(struct persistent_gnt), | ||
773 | GFP_KERNEL); | ||
774 | if (!persistent_gnt) { | ||
620 | /* | 775 | /* |
621 | * If this is a new persistent grant | 776 | * If we don't have enough memory to |
622 | * save the handler | 777 | * allocate the persistent_gnt struct |
778 | * map this grant non-persistently | ||
623 | */ | 779 | */ |
624 | persistent_gnts[i]->handle = map[j++].handle; | 780 | goto next; |
625 | } | 781 | } |
626 | pending_handle(pending_req, i) = | 782 | persistent_gnt->gnt = map[new_map_idx].ref; |
627 | persistent_gnts[i]->handle; | 783 | persistent_gnt->handle = map[new_map_idx].handle; |
784 | persistent_gnt->page = pages[seg_idx]->page; | ||
785 | if (add_persistent_gnt(blkif, | ||
786 | persistent_gnt)) { | ||
787 | kfree(persistent_gnt); | ||
788 | persistent_gnt = NULL; | ||
789 | goto next; | ||
790 | } | ||
791 | pages[seg_idx]->persistent_gnt = persistent_gnt; | ||
792 | pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n", | ||
793 | persistent_gnt->gnt, blkif->persistent_gnt_c, | ||
794 | xen_blkif_max_pgrants); | ||
795 | goto next; | ||
796 | } | ||
797 | if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) { | ||
798 | blkif->vbd.overflow_max_grants = 1; | ||
799 | pr_debug(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n", | ||
800 | blkif->domid, blkif->vbd.handle); | ||
801 | } | ||
802 | /* | ||
803 | * We could not map this grant persistently, so use it as | ||
804 | * a non-persistent grant. | ||
805 | */ | ||
806 | next: | ||
807 | new_map_idx++; | ||
808 | } | ||
809 | segs_to_map = 0; | ||
810 | last_map = map_until; | ||
811 | if (map_until != num) | ||
812 | goto again; | ||
628 | 813 | ||
629 | if (ret) | 814 | return ret; |
630 | continue; | 815 | |
631 | } else { | 816 | out_of_memory: |
632 | pending_handle(pending_req, i) = map[j++].handle; | 817 | pr_alert(DRV_PFX "%s: out of memory\n", __func__); |
633 | bitmap_set(pending_req->unmap_seg, i, 1); | 818 | put_free_pages(blkif, pages_to_gnt, segs_to_map); |
819 | return -ENOMEM; | ||
820 | } | ||
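Stripped of the grant-table details, the reworked xen_blkbk_map() above is a chunked loop: at most BLKIF_MAX_SEGMENTS_PER_REQUEST grants are queued per pass, one batched map operation is issued for that chunk, and the last_map/map_until pair records where the next pass resumes. A small stand-alone sketch of that pattern (the batch size and segment count below are made-up values, not taken from the driver):

#include <stdio.h>

#define BATCH 4                         /* stand-in for BLKIF_MAX_SEGMENTS_PER_REQUEST */

int main(void)
{
        int num = 10;                   /* total segments to map (made up) */
        int last_map = 0, map_until = 0;
        int segs_to_map, i;

again:
        segs_to_map = 0;
        for (i = map_until; i < num; i++) {
                /* the driver would queue pages[i] here (gnttab_set_map_op) */
                map_until = i + 1;
                if (++segs_to_map == BATCH)
                        break;
        }
        /* one batched map operation per pass */
        printf("mapping segments [%d, %d)\n", last_map, map_until);
        last_map = map_until;
        if (map_until != num)
                goto again;
        return 0;
}

Batching keeps the on-stack map[] array bounded at the old per-request limit even though an indirect request can now carry many more segments.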
821 | |||
822 | static int xen_blkbk_map_seg(struct pending_req *pending_req) | ||
823 | { | ||
824 | int rc; | ||
825 | |||
826 | rc = xen_blkbk_map(pending_req->blkif, pending_req->segments, | ||
827 | pending_req->nr_pages, | ||
828 | (pending_req->operation != BLKIF_OP_READ)); | ||
829 | |||
830 | return rc; | ||
831 | } | ||
634 | 832 | ||
635 | if (ret) | 833 | static int xen_blkbk_parse_indirect(struct blkif_request *req, |
636 | continue; | 834 | struct pending_req *pending_req, |
835 | struct seg_buf seg[], | ||
836 | struct phys_req *preq) | ||
837 | { | ||
838 | struct grant_page **pages = pending_req->indirect_pages; | ||
839 | struct xen_blkif *blkif = pending_req->blkif; | ||
840 | int indirect_grefs, rc, n, nseg, i; | ||
841 | struct blkif_request_segment_aligned *segments = NULL; | ||
842 | |||
843 | nseg = pending_req->nr_pages; | ||
844 | indirect_grefs = INDIRECT_PAGES(nseg); | ||
845 | BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST); | ||
846 | |||
847 | for (i = 0; i < indirect_grefs; i++) | ||
848 | pages[i]->gref = req->u.indirect.indirect_grefs[i]; | ||
849 | |||
850 | rc = xen_blkbk_map(blkif, pages, indirect_grefs, true); | ||
851 | if (rc) | ||
852 | goto unmap; | ||
853 | |||
854 | for (n = 0, i = 0; n < nseg; n++) { | ||
855 | if ((n % SEGS_PER_INDIRECT_FRAME) == 0) { | ||
856 | /* Map indirect segments */ | ||
857 | if (segments) | ||
858 | kunmap_atomic(segments); | ||
859 | segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page); | ||
860 | } | ||
861 | i = n % SEGS_PER_INDIRECT_FRAME; | ||
862 | pending_req->segments[n]->gref = segments[i].gref; | ||
863 | seg[n].nsec = segments[i].last_sect - | ||
864 | segments[i].first_sect + 1; | ||
865 | seg[n].offset = (segments[i].first_sect << 9); | ||
866 | if ((segments[i].last_sect >= (PAGE_SIZE >> 9)) || | ||
867 | (segments[i].last_sect < segments[i].first_sect)) { | ||
868 | rc = -EINVAL; | ||
869 | goto unmap; | ||
637 | } | 870 | } |
638 | seg[i].offset = (req->u.rw.seg[i].first_sect << 9); | 871 | preq->nr_sects += seg[n].nsec; |
639 | } | 872 | } |
640 | return ret; | 873 | |
874 | unmap: | ||
875 | if (segments) | ||
876 | kunmap_atomic(segments); | ||
877 | xen_blkbk_unmap(blkif, pages, indirect_grefs); | ||
878 | return rc; | ||
641 | } | 879 | } |
642 | 880 | ||
643 | static int dispatch_discard_io(struct xen_blkif *blkif, | 881 | static int dispatch_discard_io(struct xen_blkif *blkif, |
@@ -647,7 +885,18 @@ static int dispatch_discard_io(struct xen_blkif *blkif, | |||
647 | int status = BLKIF_RSP_OKAY; | 885 | int status = BLKIF_RSP_OKAY; |
648 | struct block_device *bdev = blkif->vbd.bdev; | 886 | struct block_device *bdev = blkif->vbd.bdev; |
649 | unsigned long secure; | 887 | unsigned long secure; |
888 | struct phys_req preq; | ||
889 | |||
890 | preq.sector_number = req->u.discard.sector_number; | ||
891 | preq.nr_sects = req->u.discard.nr_sectors; | ||
650 | 892 | ||
893 | err = xen_vbd_translate(&preq, blkif, WRITE); | ||
894 | if (err) { | ||
895 | pr_warn(DRV_PFX "access denied: DISCARD [%llu->%llu] on dev=%04x\n", | ||
896 | preq.sector_number, | ||
897 | preq.sector_number + preq.nr_sects, blkif->vbd.pdevice); | ||
898 | goto fail_response; | ||
899 | } | ||
651 | blkif->st_ds_req++; | 900 | blkif->st_ds_req++; |
652 | 901 | ||
653 | xen_blkif_get(blkif); | 902 | xen_blkif_get(blkif); |
@@ -658,7 +907,7 @@ static int dispatch_discard_io(struct xen_blkif *blkif, | |||
658 | err = blkdev_issue_discard(bdev, req->u.discard.sector_number, | 907 | err = blkdev_issue_discard(bdev, req->u.discard.sector_number, |
659 | req->u.discard.nr_sectors, | 908 | req->u.discard.nr_sectors, |
660 | GFP_KERNEL, secure); | 909 | GFP_KERNEL, secure); |
661 | 910 | fail_response: | |
662 | if (err == -EOPNOTSUPP) { | 911 | if (err == -EOPNOTSUPP) { |
663 | pr_debug(DRV_PFX "discard op failed, not supported\n"); | 912 | pr_debug(DRV_PFX "discard op failed, not supported\n"); |
664 | status = BLKIF_RSP_EOPNOTSUPP; | 913 | status = BLKIF_RSP_EOPNOTSUPP; |
@@ -674,7 +923,7 @@ static int dispatch_other_io(struct xen_blkif *blkif, | |||
674 | struct blkif_request *req, | 923 | struct blkif_request *req, |
675 | struct pending_req *pending_req) | 924 | struct pending_req *pending_req) |
676 | { | 925 | { |
677 | free_req(pending_req); | 926 | free_req(blkif, pending_req); |
678 | make_response(blkif, req->u.other.id, req->operation, | 927 | make_response(blkif, req->u.other.id, req->operation, |
679 | BLKIF_RSP_EOPNOTSUPP); | 928 | BLKIF_RSP_EOPNOTSUPP); |
680 | return -EIO; | 929 | return -EIO; |
@@ -726,7 +975,9 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) | |||
726 | * the proper response on the ring. | 975 | * the proper response on the ring. |
727 | */ | 976 | */ |
728 | if (atomic_dec_and_test(&pending_req->pendcnt)) { | 977 | if (atomic_dec_and_test(&pending_req->pendcnt)) { |
729 | xen_blkbk_unmap(pending_req); | 978 | xen_blkbk_unmap(pending_req->blkif, |
979 | pending_req->segments, | ||
980 | pending_req->nr_pages); | ||
730 | make_response(pending_req->blkif, pending_req->id, | 981 | make_response(pending_req->blkif, pending_req->id, |
731 | pending_req->operation, pending_req->status); | 982 | pending_req->operation, pending_req->status); |
732 | xen_blkif_put(pending_req->blkif); | 983 | xen_blkif_put(pending_req->blkif); |
@@ -734,7 +985,7 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) | |||
734 | if (atomic_read(&pending_req->blkif->drain)) | 985 | if (atomic_read(&pending_req->blkif->drain)) |
735 | complete(&pending_req->blkif->drain_complete); | 986 | complete(&pending_req->blkif->drain_complete); |
736 | } | 987 | } |
737 | free_req(pending_req); | 988 | free_req(pending_req->blkif, pending_req); |
738 | } | 989 | } |
739 | } | 990 | } |
740 | 991 | ||
@@ -767,6 +1018,12 @@ __do_block_io_op(struct xen_blkif *blkif) | |||
767 | rp = blk_rings->common.sring->req_prod; | 1018 | rp = blk_rings->common.sring->req_prod; |
768 | rmb(); /* Ensure we see queued requests up to 'rp'. */ | 1019 | rmb(); /* Ensure we see queued requests up to 'rp'. */ |
769 | 1020 | ||
1021 | if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) { | ||
1022 | rc = blk_rings->common.rsp_prod_pvt; | ||
1023 | pr_warn(DRV_PFX "Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n", | ||
1024 | rp, rc, rp - rc, blkif->vbd.pdevice); | ||
1025 | return -EACCES; | ||
1026 | } | ||
770 | while (rc != rp) { | 1027 | while (rc != rp) { |
771 | 1028 | ||
772 | if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) | 1029 | if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) |
@@ -777,7 +1034,7 @@ __do_block_io_op(struct xen_blkif *blkif) | |||
777 | break; | 1034 | break; |
778 | } | 1035 | } |
779 | 1036 | ||
780 | pending_req = alloc_req(); | 1037 | pending_req = alloc_req(blkif); |
781 | if (NULL == pending_req) { | 1038 | if (NULL == pending_req) { |
782 | blkif->st_oo_req++; | 1039 | blkif->st_oo_req++; |
783 | more_to_do = 1; | 1040 | more_to_do = 1; |
@@ -807,11 +1064,12 @@ __do_block_io_op(struct xen_blkif *blkif) | |||
807 | case BLKIF_OP_WRITE: | 1064 | case BLKIF_OP_WRITE: |
808 | case BLKIF_OP_WRITE_BARRIER: | 1065 | case BLKIF_OP_WRITE_BARRIER: |
809 | case BLKIF_OP_FLUSH_DISKCACHE: | 1066 | case BLKIF_OP_FLUSH_DISKCACHE: |
1067 | case BLKIF_OP_INDIRECT: | ||
810 | if (dispatch_rw_block_io(blkif, &req, pending_req)) | 1068 | if (dispatch_rw_block_io(blkif, &req, pending_req)) |
811 | goto done; | 1069 | goto done; |
812 | break; | 1070 | break; |
813 | case BLKIF_OP_DISCARD: | 1071 | case BLKIF_OP_DISCARD: |
814 | free_req(pending_req); | 1072 | free_req(blkif, pending_req); |
815 | if (dispatch_discard_io(blkif, &req)) | 1073 | if (dispatch_discard_io(blkif, &req)) |
816 | goto done; | 1074 | goto done; |
817 | break; | 1075 | break; |
@@ -853,17 +1111,28 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
853 | struct pending_req *pending_req) | 1111 | struct pending_req *pending_req) |
854 | { | 1112 | { |
855 | struct phys_req preq; | 1113 | struct phys_req preq; |
856 | struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 1114 | struct seg_buf *seg = pending_req->seg; |
857 | unsigned int nseg; | 1115 | unsigned int nseg; |
858 | struct bio *bio = NULL; | 1116 | struct bio *bio = NULL; |
859 | struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 1117 | struct bio **biolist = pending_req->biolist; |
860 | int i, nbio = 0; | 1118 | int i, nbio = 0; |
861 | int operation; | 1119 | int operation; |
862 | struct blk_plug plug; | 1120 | struct blk_plug plug; |
863 | bool drain = false; | 1121 | bool drain = false; |
864 | struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 1122 | struct grant_page **pages = pending_req->segments; |
1123 | unsigned short req_operation; | ||
1124 | |||
1125 | req_operation = req->operation == BLKIF_OP_INDIRECT ? | ||
1126 | req->u.indirect.indirect_op : req->operation; | ||
1127 | if ((req->operation == BLKIF_OP_INDIRECT) && | ||
1128 | (req_operation != BLKIF_OP_READ) && | ||
1129 | (req_operation != BLKIF_OP_WRITE)) { | ||
1130 | pr_debug(DRV_PFX "Invalid indirect operation (%u)\n", | ||
1131 | req_operation); | ||
1132 | goto fail_response; | ||
1133 | } | ||
865 | 1134 | ||
866 | switch (req->operation) { | 1135 | switch (req_operation) { |
867 | case BLKIF_OP_READ: | 1136 | case BLKIF_OP_READ: |
868 | blkif->st_rd_req++; | 1137 | blkif->st_rd_req++; |
869 | operation = READ; | 1138 | operation = READ; |
@@ -885,33 +1154,47 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
885 | } | 1154 | } |
886 | 1155 | ||
887 | /* Check that the number of segments is sane. */ | 1156 | /* Check that the number of segments is sane. */ |
888 | nseg = req->u.rw.nr_segments; | 1157 | nseg = req->operation == BLKIF_OP_INDIRECT ? |
1158 | req->u.indirect.nr_segments : req->u.rw.nr_segments; | ||
889 | 1159 | ||
890 | if (unlikely(nseg == 0 && operation != WRITE_FLUSH) || | 1160 | if (unlikely(nseg == 0 && operation != WRITE_FLUSH) || |
891 | unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { | 1161 | unlikely((req->operation != BLKIF_OP_INDIRECT) && |
1162 | (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) || | ||
1163 | unlikely((req->operation == BLKIF_OP_INDIRECT) && | ||
1164 | (nseg > MAX_INDIRECT_SEGMENTS))) { | ||
892 | pr_debug(DRV_PFX "Bad number of segments in request (%d)\n", | 1165 | pr_debug(DRV_PFX "Bad number of segments in request (%d)\n", |
893 | nseg); | 1166 | nseg); |
894 | /* Haven't submitted any bio's yet. */ | 1167 | /* Haven't submitted any bio's yet. */ |
895 | goto fail_response; | 1168 | goto fail_response; |
896 | } | 1169 | } |
897 | 1170 | ||
898 | preq.sector_number = req->u.rw.sector_number; | ||
899 | preq.nr_sects = 0; | 1171 | preq.nr_sects = 0; |
900 | 1172 | ||
901 | pending_req->blkif = blkif; | 1173 | pending_req->blkif = blkif; |
902 | pending_req->id = req->u.rw.id; | 1174 | pending_req->id = req->u.rw.id; |
903 | pending_req->operation = req->operation; | 1175 | pending_req->operation = req_operation; |
904 | pending_req->status = BLKIF_RSP_OKAY; | 1176 | pending_req->status = BLKIF_RSP_OKAY; |
905 | pending_req->nr_pages = nseg; | 1177 | pending_req->nr_pages = nseg; |
906 | 1178 | ||
907 | for (i = 0; i < nseg; i++) { | 1179 | if (req->operation != BLKIF_OP_INDIRECT) { |
908 | seg[i].nsec = req->u.rw.seg[i].last_sect - | 1180 | preq.dev = req->u.rw.handle; |
909 | req->u.rw.seg[i].first_sect + 1; | 1181 | preq.sector_number = req->u.rw.sector_number; |
910 | if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) || | 1182 | for (i = 0; i < nseg; i++) { |
911 | (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect)) | 1183 | pages[i]->gref = req->u.rw.seg[i].gref; |
1184 | seg[i].nsec = req->u.rw.seg[i].last_sect - | ||
1185 | req->u.rw.seg[i].first_sect + 1; | ||
1186 | seg[i].offset = (req->u.rw.seg[i].first_sect << 9); | ||
1187 | if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) || | ||
1188 | (req->u.rw.seg[i].last_sect < | ||
1189 | req->u.rw.seg[i].first_sect)) | ||
1190 | goto fail_response; | ||
1191 | preq.nr_sects += seg[i].nsec; | ||
1192 | } | ||
1193 | } else { | ||
1194 | preq.dev = req->u.indirect.handle; | ||
1195 | preq.sector_number = req->u.indirect.sector_number; | ||
1196 | if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq)) | ||
912 | goto fail_response; | 1197 | goto fail_response; |
913 | preq.nr_sects += seg[i].nsec; | ||
914 | |||
915 | } | 1198 | } |
916 | 1199 | ||
917 | if (xen_vbd_translate(&preq, blkif, operation) != 0) { | 1200 | if (xen_vbd_translate(&preq, blkif, operation) != 0) { |
@@ -948,7 +1231,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
948 | * the hypercall to unmap the grants - that is all done in | 1231 | * the hypercall to unmap the grants - that is all done in |
949 | * xen_blkbk_unmap. | 1232 | * xen_blkbk_unmap. |
950 | */ | 1233 | */ |
951 | if (xen_blkbk_map(req, pending_req, seg, pages)) | 1234 | if (xen_blkbk_map_seg(pending_req)) |
952 | goto fail_flush; | 1235 | goto fail_flush; |
953 | 1236 | ||
954 | /* | 1237 | /* |
@@ -960,11 +1243,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
960 | for (i = 0; i < nseg; i++) { | 1243 | for (i = 0; i < nseg; i++) { |
961 | while ((bio == NULL) || | 1244 | while ((bio == NULL) || |
962 | (bio_add_page(bio, | 1245 | (bio_add_page(bio, |
963 | pages[i], | 1246 | pages[i]->page, |
964 | seg[i].nsec << 9, | 1247 | seg[i].nsec << 9, |
965 | seg[i].offset) == 0)) { | 1248 | seg[i].offset) == 0)) { |
966 | 1249 | ||
967 | bio = bio_alloc(GFP_KERNEL, nseg-i); | 1250 | int nr_iovecs = min_t(int, (nseg-i), BIO_MAX_PAGES); |
1251 | bio = bio_alloc(GFP_KERNEL, nr_iovecs); | ||
968 | if (unlikely(bio == NULL)) | 1252 | if (unlikely(bio == NULL)) |
969 | goto fail_put_bio; | 1253 | goto fail_put_bio; |
970 | 1254 | ||
@@ -1009,11 +1293,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
1009 | return 0; | 1293 | return 0; |
1010 | 1294 | ||
1011 | fail_flush: | 1295 | fail_flush: |
1012 | xen_blkbk_unmap(pending_req); | 1296 | xen_blkbk_unmap(blkif, pending_req->segments, |
1297 | pending_req->nr_pages); | ||
1013 | fail_response: | 1298 | fail_response: |
1014 | /* Haven't submitted any bio's yet. */ | 1299 | /* Haven't submitted any bio's yet. */ |
1015 | make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR); | 1300 | make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); |
1016 | free_req(pending_req); | 1301 | free_req(blkif, pending_req); |
1017 | msleep(1); /* back off a bit */ | 1302 | msleep(1); /* back off a bit */ |
1018 | return -EIO; | 1303 | return -EIO; |
1019 | 1304 | ||
@@ -1070,73 +1355,20 @@ static void make_response(struct xen_blkif *blkif, u64 id, | |||
1070 | 1355 | ||
1071 | static int __init xen_blkif_init(void) | 1356 | static int __init xen_blkif_init(void) |
1072 | { | 1357 | { |
1073 | int i, mmap_pages; | ||
1074 | int rc = 0; | 1358 | int rc = 0; |
1075 | 1359 | ||
1076 | if (!xen_domain()) | 1360 | if (!xen_domain()) |
1077 | return -ENODEV; | 1361 | return -ENODEV; |
1078 | 1362 | ||
1079 | blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL); | ||
1080 | if (!blkbk) { | ||
1081 | pr_alert(DRV_PFX "%s: out of memory!\n", __func__); | ||
1082 | return -ENOMEM; | ||
1083 | } | ||
1084 | |||
1085 | mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; | ||
1086 | |||
1087 | blkbk->pending_reqs = kzalloc(sizeof(blkbk->pending_reqs[0]) * | ||
1088 | xen_blkif_reqs, GFP_KERNEL); | ||
1089 | blkbk->pending_grant_handles = kmalloc(sizeof(blkbk->pending_grant_handles[0]) * | ||
1090 | mmap_pages, GFP_KERNEL); | ||
1091 | blkbk->pending_pages = kzalloc(sizeof(blkbk->pending_pages[0]) * | ||
1092 | mmap_pages, GFP_KERNEL); | ||
1093 | |||
1094 | if (!blkbk->pending_reqs || !blkbk->pending_grant_handles || | ||
1095 | !blkbk->pending_pages) { | ||
1096 | rc = -ENOMEM; | ||
1097 | goto out_of_memory; | ||
1098 | } | ||
1099 | |||
1100 | for (i = 0; i < mmap_pages; i++) { | ||
1101 | blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; | ||
1102 | blkbk->pending_pages[i] = alloc_page(GFP_KERNEL); | ||
1103 | if (blkbk->pending_pages[i] == NULL) { | ||
1104 | rc = -ENOMEM; | ||
1105 | goto out_of_memory; | ||
1106 | } | ||
1107 | } | ||
1108 | rc = xen_blkif_interface_init(); | 1363 | rc = xen_blkif_interface_init(); |
1109 | if (rc) | 1364 | if (rc) |
1110 | goto failed_init; | 1365 | goto failed_init; |
1111 | 1366 | ||
1112 | INIT_LIST_HEAD(&blkbk->pending_free); | ||
1113 | spin_lock_init(&blkbk->pending_free_lock); | ||
1114 | init_waitqueue_head(&blkbk->pending_free_wq); | ||
1115 | |||
1116 | for (i = 0; i < xen_blkif_reqs; i++) | ||
1117 | list_add_tail(&blkbk->pending_reqs[i].free_list, | ||
1118 | &blkbk->pending_free); | ||
1119 | |||
1120 | rc = xen_blkif_xenbus_init(); | 1367 | rc = xen_blkif_xenbus_init(); |
1121 | if (rc) | 1368 | if (rc) |
1122 | goto failed_init; | 1369 | goto failed_init; |
1123 | 1370 | ||
1124 | return 0; | ||
1125 | |||
1126 | out_of_memory: | ||
1127 | pr_alert(DRV_PFX "%s: out of memory\n", __func__); | ||
1128 | failed_init: | 1371 | failed_init: |
1129 | kfree(blkbk->pending_reqs); | ||
1130 | kfree(blkbk->pending_grant_handles); | ||
1131 | if (blkbk->pending_pages) { | ||
1132 | for (i = 0; i < mmap_pages; i++) { | ||
1133 | if (blkbk->pending_pages[i]) | ||
1134 | __free_page(blkbk->pending_pages[i]); | ||
1135 | } | ||
1136 | kfree(blkbk->pending_pages); | ||
1137 | } | ||
1138 | kfree(blkbk); | ||
1139 | blkbk = NULL; | ||
1140 | return rc; | 1372 | return rc; |
1141 | } | 1373 | } |
1142 | 1374 | ||
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 60103e2517ba..8d8807563d99 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h | |||
@@ -50,6 +50,19 @@ | |||
50 | __func__, __LINE__, ##args) | 50 | __func__, __LINE__, ##args) |
51 | 51 | ||
52 | 52 | ||
53 | /* | ||
54 | * This is the maximum number of segments that would be allowed in indirect | ||
55 | * requests. This value will also be passed to the frontend. | ||
56 | */ | ||
57 | #define MAX_INDIRECT_SEGMENTS 256 | ||
58 | |||
59 | #define SEGS_PER_INDIRECT_FRAME \ | ||
60 | (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned)) | ||
61 | #define MAX_INDIRECT_PAGES \ | ||
62 | ((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) | ||
63 | #define INDIRECT_PAGES(_segs) \ | ||
64 | ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) | ||
65 | |||
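For a feel of the numbers these macros produce, assume 4 KiB pages and an 8-byte struct blkif_request_segment_aligned (assumed values; neither size appears in this hunk): SEGS_PER_INDIRECT_FRAME works out to 512, so the 256-segment maximum fits in a single indirect page. A quick stand-alone check of that arithmetic:

#include <stdio.h>

int main(void)
{
        /* assumed sizes, not taken from the Xen headers */
        unsigned long page_size = 4096;
        unsigned long seg_entry = 8;    /* sizeof(struct blkif_request_segment_aligned) */
        unsigned long max_segs  = 256;  /* MAX_INDIRECT_SEGMENTS */

        unsigned long per_frame = page_size / seg_entry;
        unsigned long pages = (max_segs + per_frame - 1) / per_frame;

        printf("%lu segments per indirect frame, %lu indirect page(s) needed\n",
               per_frame, pages);
        return 0;
}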
53 | /* Not a real protocol. Used to generate ring structs which contain | 66 | /* Not a real protocol. Used to generate ring structs which contain |
54 | * the elements common to all protocols only. This way we get a | 67 | * the elements common to all protocols only. This way we get a |
55 | * compiler-checkable way to use common struct elements, so we can | 68 | * compiler-checkable way to use common struct elements, so we can |
@@ -83,12 +96,31 @@ struct blkif_x86_32_request_other { | |||
83 | uint64_t id; /* private guest value, echoed in resp */ | 96 | uint64_t id; /* private guest value, echoed in resp */ |
84 | } __attribute__((__packed__)); | 97 | } __attribute__((__packed__)); |
85 | 98 | ||
99 | struct blkif_x86_32_request_indirect { | ||
100 | uint8_t indirect_op; | ||
101 | uint16_t nr_segments; | ||
102 | uint64_t id; | ||
103 | blkif_sector_t sector_number; | ||
104 | blkif_vdev_t handle; | ||
105 | uint16_t _pad1; | ||
106 | grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST]; | ||
107 | /* | ||
108 | * The maximum number of indirect segments (and pages) that will | ||
109 | * be used is determined by MAX_INDIRECT_SEGMENTS; this value | ||
110 | * is also exported to the guest (via xenstore | ||
111 | * feature-max-indirect-segments entry), so the frontend knows how | ||
112 | * many indirect segments the backend supports. | ||
113 | */ | ||
114 | uint64_t _pad2; /* make it 64 byte aligned */ | ||
115 | } __attribute__((__packed__)); | ||
116 | |||
86 | struct blkif_x86_32_request { | 117 | struct blkif_x86_32_request { |
87 | uint8_t operation; /* BLKIF_OP_??? */ | 118 | uint8_t operation; /* BLKIF_OP_??? */ |
88 | union { | 119 | union { |
89 | struct blkif_x86_32_request_rw rw; | 120 | struct blkif_x86_32_request_rw rw; |
90 | struct blkif_x86_32_request_discard discard; | 121 | struct blkif_x86_32_request_discard discard; |
91 | struct blkif_x86_32_request_other other; | 122 | struct blkif_x86_32_request_other other; |
123 | struct blkif_x86_32_request_indirect indirect; | ||
92 | } u; | 124 | } u; |
93 | } __attribute__((__packed__)); | 125 | } __attribute__((__packed__)); |
94 | 126 | ||
@@ -127,12 +159,32 @@ struct blkif_x86_64_request_other { | |||
127 | uint64_t id; /* private guest value, echoed in resp */ | 159 | uint64_t id; /* private guest value, echoed in resp */ |
128 | } __attribute__((__packed__)); | 160 | } __attribute__((__packed__)); |
129 | 161 | ||
162 | struct blkif_x86_64_request_indirect { | ||
163 | uint8_t indirect_op; | ||
164 | uint16_t nr_segments; | ||
165 | uint32_t _pad1; /* offsetof(blkif_..,u.indirect.id)==8 */ | ||
166 | uint64_t id; | ||
167 | blkif_sector_t sector_number; | ||
168 | blkif_vdev_t handle; | ||
169 | uint16_t _pad2; | ||
170 | grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST]; | ||
171 | /* | ||
172 | * The maximum number of indirect segments (and pages) that will | ||
173 | * be used is determined by MAX_INDIRECT_SEGMENTS; this value | ||
174 | * is also exported to the guest (via xenstore | ||
175 | * feature-max-indirect-segments entry), so the frontend knows how | ||
176 | * many indirect segments the backend supports. | ||
177 | */ | ||
178 | uint32_t _pad3; /* make it 64 byte aligned */ | ||
179 | } __attribute__((__packed__)); | ||
180 | |||
130 | struct blkif_x86_64_request { | 181 | struct blkif_x86_64_request { |
131 | uint8_t operation; /* BLKIF_OP_??? */ | 182 | uint8_t operation; /* BLKIF_OP_??? */ |
132 | union { | 183 | union { |
133 | struct blkif_x86_64_request_rw rw; | 184 | struct blkif_x86_64_request_rw rw; |
134 | struct blkif_x86_64_request_discard discard; | 185 | struct blkif_x86_64_request_discard discard; |
135 | struct blkif_x86_64_request_other other; | 186 | struct blkif_x86_64_request_other other; |
187 | struct blkif_x86_64_request_indirect indirect; | ||
136 | } u; | 188 | } u; |
137 | } __attribute__((__packed__)); | 189 | } __attribute__((__packed__)); |
138 | 190 | ||
@@ -182,12 +234,26 @@ struct xen_vbd { | |||
182 | 234 | ||
183 | struct backend_info; | 235 | struct backend_info; |
184 | 236 | ||
237 | /* Number of available flags */ | ||
238 | #define PERSISTENT_GNT_FLAGS_SIZE 2 | ||
239 | /* This persistent grant is currently in use */ | ||
240 | #define PERSISTENT_GNT_ACTIVE 0 | ||
241 | /* | ||
242 | * This persistent grant has been used; this flag is set when we remove the | ||
243 | * PERSISTENT_GNT_ACTIVE, to know that this grant has been used recently. | ||
244 | */ | ||
245 | #define PERSISTENT_GNT_WAS_ACTIVE 1 | ||
246 | |||
247 | /* Number of requests that we can fit in a ring */ | ||
248 | #define XEN_BLKIF_REQS 32 | ||
185 | 249 | ||
186 | struct persistent_gnt { | 250 | struct persistent_gnt { |
187 | struct page *page; | 251 | struct page *page; |
188 | grant_ref_t gnt; | 252 | grant_ref_t gnt; |
189 | grant_handle_t handle; | 253 | grant_handle_t handle; |
254 | DECLARE_BITMAP(flags, PERSISTENT_GNT_FLAGS_SIZE); | ||
190 | struct rb_node node; | 255 | struct rb_node node; |
256 | struct list_head remove_node; | ||
191 | }; | 257 | }; |
192 | 258 | ||
193 | struct xen_blkif { | 259 | struct xen_blkif { |
@@ -219,6 +285,23 @@ struct xen_blkif { | |||
219 | /* tree to store persistent grants */ | 285 | /* tree to store persistent grants */ |
220 | struct rb_root persistent_gnts; | 286 | struct rb_root persistent_gnts; |
221 | unsigned int persistent_gnt_c; | 287 | unsigned int persistent_gnt_c; |
288 | atomic_t persistent_gnt_in_use; | ||
289 | unsigned long next_lru; | ||
290 | |||
291 | /* used by the kworker that offloads work from the persistent purge */ | ||
292 | struct list_head persistent_purge_list; | ||
293 | struct work_struct persistent_purge_work; | ||
294 | |||
295 | /* buffer of free pages to map grant refs */ | ||
296 | spinlock_t free_pages_lock; | ||
297 | int free_pages_num; | ||
298 | struct list_head free_pages; | ||
299 | |||
300 | /* List of all 'pending_req' available */ | ||
301 | struct list_head pending_free; | ||
302 | /* And its spinlock. */ | ||
303 | spinlock_t pending_free_lock; | ||
304 | wait_queue_head_t pending_free_wq; | ||
222 | 305 | ||
223 | /* statistics */ | 306 | /* statistics */ |
224 | unsigned long st_print; | 307 | unsigned long st_print; |
@@ -231,6 +314,41 @@ struct xen_blkif { | |||
231 | unsigned long long st_wr_sect; | 314 | unsigned long long st_wr_sect; |
232 | 315 | ||
233 | wait_queue_head_t waiting_to_free; | 316 | wait_queue_head_t waiting_to_free; |
317 | /* Thread shutdown wait queue. */ | ||
318 | wait_queue_head_t shutdown_wq; | ||
319 | }; | ||
320 | |||
321 | struct seg_buf { | ||
322 | unsigned long offset; | ||
323 | unsigned int nsec; | ||
324 | }; | ||
325 | |||
326 | struct grant_page { | ||
327 | struct page *page; | ||
328 | struct persistent_gnt *persistent_gnt; | ||
329 | grant_handle_t handle; | ||
330 | grant_ref_t gref; | ||
331 | }; | ||
332 | |||
333 | /* | ||
334 | * Each outstanding request that we've passed to the lower device layers has a | ||
335 | * 'pending_req' allocated to it. Each buffer_head that completes decrements | ||
336 | * the pendcnt towards zero. When it hits zero, the specified domain has a | ||
337 | * response queued for it, with the saved 'id' passed back. | ||
338 | */ | ||
339 | struct pending_req { | ||
340 | struct xen_blkif *blkif; | ||
341 | u64 id; | ||
342 | int nr_pages; | ||
343 | atomic_t pendcnt; | ||
344 | unsigned short operation; | ||
345 | int status; | ||
346 | struct list_head free_list; | ||
347 | struct grant_page *segments[MAX_INDIRECT_SEGMENTS]; | ||
348 | /* Indirect descriptors */ | ||
349 | struct grant_page *indirect_pages[MAX_INDIRECT_PAGES]; | ||
350 | struct seg_buf seg[MAX_INDIRECT_SEGMENTS]; | ||
351 | struct bio *biolist[MAX_INDIRECT_SEGMENTS]; | ||
234 | }; | 352 | }; |
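For a rough sense of the preallocation these arrays imply (a back-of-the-envelope estimate only, assuming a 64-bit build, 8-byte pointers, a padded 16-byte seg_buf and a single indirect page per request; none of these figures come from the patch itself), each pending_req embeds on the order of 8 KiB of arrays, and XEN_BLKIF_REQS of them are allocated per backend instance:

#include <stdio.h>

int main(void)
{
        /* all assumed: 64-bit pointers, 16-byte seg_buf, 1 indirect page */
        unsigned long segs = 256;               /* MAX_INDIRECT_SEGMENTS */
        unsigned long reqs = 32;                /* XEN_BLKIF_REQS */
        unsigned long per_req = segs * 8        /* segments[] pointers */
                              + 1 * 8           /* indirect_pages[] pointers */
                              + segs * 16       /* seg[] entries */
                              + segs * 8;       /* biolist[] pointers */

        printf("~%lu KiB of embedded pending_req arrays per backend\n",
               reqs * per_req / 1024);
        return 0;
}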
235 | 353 | ||
236 | 354 | ||
@@ -257,6 +375,7 @@ int xen_blkif_xenbus_init(void); | |||
257 | 375 | ||
258 | irqreturn_t xen_blkif_be_int(int irq, void *dev_id); | 376 | irqreturn_t xen_blkif_be_int(int irq, void *dev_id); |
259 | int xen_blkif_schedule(void *arg); | 377 | int xen_blkif_schedule(void *arg); |
378 | int xen_blkif_purge_persistent(void *arg); | ||
260 | 379 | ||
261 | int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, | 380 | int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, |
262 | struct backend_info *be, int state); | 381 | struct backend_info *be, int state); |
@@ -268,7 +387,7 @@ struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be); | |||
268 | static inline void blkif_get_x86_32_req(struct blkif_request *dst, | 387 | static inline void blkif_get_x86_32_req(struct blkif_request *dst, |
269 | struct blkif_x86_32_request *src) | 388 | struct blkif_x86_32_request *src) |
270 | { | 389 | { |
271 | int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; | 390 | int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j; |
272 | dst->operation = src->operation; | 391 | dst->operation = src->operation; |
273 | switch (src->operation) { | 392 | switch (src->operation) { |
274 | case BLKIF_OP_READ: | 393 | case BLKIF_OP_READ: |
@@ -291,6 +410,18 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst, | |||
291 | dst->u.discard.sector_number = src->u.discard.sector_number; | 410 | dst->u.discard.sector_number = src->u.discard.sector_number; |
292 | dst->u.discard.nr_sectors = src->u.discard.nr_sectors; | 411 | dst->u.discard.nr_sectors = src->u.discard.nr_sectors; |
293 | break; | 412 | break; |
413 | case BLKIF_OP_INDIRECT: | ||
414 | dst->u.indirect.indirect_op = src->u.indirect.indirect_op; | ||
415 | dst->u.indirect.nr_segments = src->u.indirect.nr_segments; | ||
416 | dst->u.indirect.handle = src->u.indirect.handle; | ||
417 | dst->u.indirect.id = src->u.indirect.id; | ||
418 | dst->u.indirect.sector_number = src->u.indirect.sector_number; | ||
419 | barrier(); | ||
420 | j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments)); | ||
421 | for (i = 0; i < j; i++) | ||
422 | dst->u.indirect.indirect_grefs[i] = | ||
423 | src->u.indirect.indirect_grefs[i]; | ||
424 | break; | ||
294 | default: | 425 | default: |
295 | /* | 426 | /* |
296 | * Don't know how to translate this op. Only get the | 427 | * Don't know how to translate this op. Only get the |
@@ -304,7 +435,7 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst, | |||
304 | static inline void blkif_get_x86_64_req(struct blkif_request *dst, | 435 | static inline void blkif_get_x86_64_req(struct blkif_request *dst, |
305 | struct blkif_x86_64_request *src) | 436 | struct blkif_x86_64_request *src) |
306 | { | 437 | { |
307 | int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; | 438 | int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j; |
308 | dst->operation = src->operation; | 439 | dst->operation = src->operation; |
309 | switch (src->operation) { | 440 | switch (src->operation) { |
310 | case BLKIF_OP_READ: | 441 | case BLKIF_OP_READ: |
@@ -327,6 +458,18 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst, | |||
327 | dst->u.discard.sector_number = src->u.discard.sector_number; | 458 | dst->u.discard.sector_number = src->u.discard.sector_number; |
328 | dst->u.discard.nr_sectors = src->u.discard.nr_sectors; | 459 | dst->u.discard.nr_sectors = src->u.discard.nr_sectors; |
329 | break; | 460 | break; |
461 | case BLKIF_OP_INDIRECT: | ||
462 | dst->u.indirect.indirect_op = src->u.indirect.indirect_op; | ||
463 | dst->u.indirect.nr_segments = src->u.indirect.nr_segments; | ||
464 | dst->u.indirect.handle = src->u.indirect.handle; | ||
465 | dst->u.indirect.id = src->u.indirect.id; | ||
466 | dst->u.indirect.sector_number = src->u.indirect.sector_number; | ||
467 | barrier(); | ||
468 | j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments)); | ||
469 | for (i = 0; i < j; i++) | ||
470 | dst->u.indirect.indirect_grefs[i] = | ||
471 | src->u.indirect.indirect_grefs[i]; | ||
472 | break; | ||
330 | default: | 473 | default: |
331 | /* | 474 | /* |
332 | * Don't know how to translate this op. Only get the | 475 | * Don't know how to translate this op. Only get the |
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 04608a6502d7..fe5c3cd10c34 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c | |||
@@ -98,12 +98,17 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) | |||
98 | err = PTR_ERR(blkif->xenblkd); | 98 | err = PTR_ERR(blkif->xenblkd); |
99 | blkif->xenblkd = NULL; | 99 | blkif->xenblkd = NULL; |
100 | xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); | 100 | xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); |
101 | return; | ||
101 | } | 102 | } |
102 | } | 103 | } |
103 | 104 | ||
104 | static struct xen_blkif *xen_blkif_alloc(domid_t domid) | 105 | static struct xen_blkif *xen_blkif_alloc(domid_t domid) |
105 | { | 106 | { |
106 | struct xen_blkif *blkif; | 107 | struct xen_blkif *blkif; |
108 | struct pending_req *req, *n; | ||
109 | int i, j; | ||
110 | |||
111 | BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST); | ||
107 | 112 | ||
108 | blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL); | 113 | blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL); |
109 | if (!blkif) | 114 | if (!blkif) |
@@ -118,8 +123,57 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) | |||
118 | blkif->st_print = jiffies; | 123 | blkif->st_print = jiffies; |
119 | init_waitqueue_head(&blkif->waiting_to_free); | 124 | init_waitqueue_head(&blkif->waiting_to_free); |
120 | blkif->persistent_gnts.rb_node = NULL; | 125 | blkif->persistent_gnts.rb_node = NULL; |
126 | spin_lock_init(&blkif->free_pages_lock); | ||
127 | INIT_LIST_HEAD(&blkif->free_pages); | ||
128 | blkif->free_pages_num = 0; | ||
129 | atomic_set(&blkif->persistent_gnt_in_use, 0); | ||
130 | |||
131 | INIT_LIST_HEAD(&blkif->pending_free); | ||
132 | |||
133 | for (i = 0; i < XEN_BLKIF_REQS; i++) { | ||
134 | req = kzalloc(sizeof(*req), GFP_KERNEL); | ||
135 | if (!req) | ||
136 | goto fail; | ||
137 | list_add_tail(&req->free_list, | ||
138 | &blkif->pending_free); | ||
139 | for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { | ||
140 | req->segments[j] = kzalloc(sizeof(*req->segments[0]), | ||
141 | GFP_KERNEL); | ||
142 | if (!req->segments[j]) | ||
143 | goto fail; | ||
144 | } | ||
145 | for (j = 0; j < MAX_INDIRECT_PAGES; j++) { | ||
146 | req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]), | ||
147 | GFP_KERNEL); | ||
148 | if (!req->indirect_pages[j]) | ||
149 | goto fail; | ||
150 | } | ||
151 | } | ||
152 | spin_lock_init(&blkif->pending_free_lock); | ||
153 | init_waitqueue_head(&blkif->pending_free_wq); | ||
154 | init_waitqueue_head(&blkif->shutdown_wq); | ||
121 | 155 | ||
122 | return blkif; | 156 | return blkif; |
157 | |||
158 | fail: | ||
159 | list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { | ||
160 | list_del(&req->free_list); | ||
161 | for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { | ||
162 | if (!req->segments[j]) | ||
163 | break; | ||
164 | kfree(req->segments[j]); | ||
165 | } | ||
166 | for (j = 0; j < MAX_INDIRECT_PAGES; j++) { | ||
167 | if (!req->indirect_pages[j]) | ||
168 | break; | ||
169 | kfree(req->indirect_pages[j]); | ||
170 | } | ||
171 | kfree(req); | ||
172 | } | ||
173 | |||
174 | kmem_cache_free(xen_blkif_cachep, blkif); | ||
175 | |||
176 | return ERR_PTR(-ENOMEM); | ||
123 | } | 177 | } |
124 | 178 | ||
125 | static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page, | 179 | static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page, |
@@ -178,6 +232,7 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif) | |||
178 | { | 232 | { |
179 | if (blkif->xenblkd) { | 233 | if (blkif->xenblkd) { |
180 | kthread_stop(blkif->xenblkd); | 234 | kthread_stop(blkif->xenblkd); |
235 | wake_up(&blkif->shutdown_wq); | ||
181 | blkif->xenblkd = NULL; | 236 | blkif->xenblkd = NULL; |
182 | } | 237 | } |
183 | 238 | ||
@@ -198,8 +253,28 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif) | |||
198 | 253 | ||
199 | static void xen_blkif_free(struct xen_blkif *blkif) | 254 | static void xen_blkif_free(struct xen_blkif *blkif) |
200 | { | 255 | { |
256 | struct pending_req *req, *n; | ||
257 | int i = 0, j; | ||
258 | |||
201 | if (!atomic_dec_and_test(&blkif->refcnt)) | 259 | if (!atomic_dec_and_test(&blkif->refcnt)) |
202 | BUG(); | 260 | BUG(); |
261 | |||
262 | /* Check that there is no request in use */ | ||
263 | list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { | ||
264 | list_del(&req->free_list); | ||
265 | |||
266 | for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) | ||
267 | kfree(req->segments[j]); | ||
268 | |||
269 | for (j = 0; j < MAX_INDIRECT_PAGES; j++) | ||
270 | kfree(req->indirect_pages[j]); | ||
271 | |||
272 | kfree(req); | ||
273 | i++; | ||
274 | } | ||
275 | |||
276 | WARN_ON(i != XEN_BLKIF_REQS); | ||
277 | |||
203 | kmem_cache_free(xen_blkif_cachep, blkif); | 278 | kmem_cache_free(xen_blkif_cachep, blkif); |
204 | } | 279 | } |
205 | 280 | ||
@@ -678,6 +753,11 @@ again: | |||
678 | dev->nodename); | 753 | dev->nodename); |
679 | goto abort; | 754 | goto abort; |
680 | } | 755 | } |
756 | err = xenbus_printf(xbt, dev->nodename, "feature-max-indirect-segments", "%u", | ||
757 | MAX_INDIRECT_SEGMENTS); | ||
758 | if (err) | ||
759 | dev_warn(&dev->dev, "writing %s/feature-max-indirect-segments (%d)", | ||
760 | dev->nodename, err); | ||
681 | 761 | ||
682 | err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", | 762 | err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", |
683 | (unsigned long long)vbd_sz(&be->blkif->vbd)); | 763 | (unsigned long long)vbd_sz(&be->blkif->vbd)); |
@@ -704,6 +784,11 @@ again: | |||
704 | dev->nodename); | 784 | dev->nodename); |
705 | goto abort; | 785 | goto abort; |
706 | } | 786 | } |
787 | err = xenbus_printf(xbt, dev->nodename, "physical-sector-size", "%u", | ||
788 | bdev_physical_block_size(be->blkif->vbd.bdev)); | ||
789 | if (err) | ||
790 | xenbus_dev_error(dev, err, "writing %s/physical-sector-size", | ||
791 | dev->nodename); | ||
707 | 792 | ||
708 | err = xenbus_transaction_end(xbt, 0); | 793 | err = xenbus_transaction_end(xbt, 0); |
709 | if (err == -EAGAIN) | 794 | if (err == -EAGAIN) |
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index d89ef86220f4..a4660bbee8a6 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c | |||
@@ -74,12 +74,30 @@ struct grant { | |||
74 | struct blk_shadow { | 74 | struct blk_shadow { |
75 | struct blkif_request req; | 75 | struct blkif_request req; |
76 | struct request *request; | 76 | struct request *request; |
77 | struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 77 | struct grant **grants_used; |
78 | struct grant **indirect_grants; | ||
79 | struct scatterlist *sg; | ||
80 | }; | ||
81 | |||
82 | struct split_bio { | ||
83 | struct bio *bio; | ||
84 | atomic_t pending; | ||
85 | int err; | ||
78 | }; | 86 | }; |
79 | 87 | ||
80 | static DEFINE_MUTEX(blkfront_mutex); | 88 | static DEFINE_MUTEX(blkfront_mutex); |
81 | static const struct block_device_operations xlvbd_block_fops; | 89 | static const struct block_device_operations xlvbd_block_fops; |
82 | 90 | ||
91 | /* | ||
92 | * Maximum number of segments in indirect requests; the actual value used by | ||
93 | * the frontend driver is the minimum of this value and the value provided | ||
94 | * by the backend driver. | ||
95 | */ | ||
96 | |||
97 | static unsigned int xen_blkif_max_segments = 32; | ||
98 | module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); | ||
99 | MODULE_PARM_DESC(max, "Maximum number of segments in indirect requests (default is 32)"); | ||
100 | |||
83 | #define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) | 101 | #define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) |
84 | 102 | ||
85 | /* | 103 | /* |
@@ -98,7 +116,6 @@ struct blkfront_info | |||
98 | enum blkif_state connected; | 116 | enum blkif_state connected; |
99 | int ring_ref; | 117 | int ring_ref; |
100 | struct blkif_front_ring ring; | 118 | struct blkif_front_ring ring; |
101 | struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
102 | unsigned int evtchn, irq; | 119 | unsigned int evtchn, irq; |
103 | struct request_queue *rq; | 120 | struct request_queue *rq; |
104 | struct work_struct work; | 121 | struct work_struct work; |
@@ -114,6 +131,7 @@ struct blkfront_info | |||
114 | unsigned int discard_granularity; | 131 | unsigned int discard_granularity; |
115 | unsigned int discard_alignment; | 132 | unsigned int discard_alignment; |
116 | unsigned int feature_persistent:1; | 133 | unsigned int feature_persistent:1; |
134 | unsigned int max_indirect_segments; | ||
117 | int is_ready; | 135 | int is_ready; |
118 | }; | 136 | }; |
119 | 137 | ||
@@ -142,6 +160,13 @@ static DEFINE_SPINLOCK(minor_lock); | |||
142 | 160 | ||
143 | #define DEV_NAME "xvd" /* name in /dev */ | 161 | #define DEV_NAME "xvd" /* name in /dev */ |
144 | 162 | ||
163 | #define SEGS_PER_INDIRECT_FRAME \ | ||
164 | (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned)) | ||
165 | #define INDIRECT_GREFS(_segs) \ | ||
166 | ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) | ||
167 | |||
168 | static int blkfront_setup_indirect(struct blkfront_info *info); | ||
169 | |||
145 | static int get_id_from_freelist(struct blkfront_info *info) | 170 | static int get_id_from_freelist(struct blkfront_info *info) |
146 | { | 171 | { |
147 | unsigned long free = info->shadow_free; | 172 | unsigned long free = info->shadow_free; |
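For a sense of scale of the two new macros: with 4 KiB pages, and assuming struct blkif_request_segment_aligned is 8 bytes (an assumption about the header, not shown in this hunk), one indirect page holds 512 segment descriptors, so the default of 32 indirect segments needs only a single extra grant:

#define EX_PAGE_SIZE			4096u
#define EX_SEG_SIZE			8u	/* assumed sizeof(struct blkif_request_segment_aligned) */
#define EX_SEGS_PER_INDIRECT_FRAME	(EX_PAGE_SIZE / EX_SEG_SIZE)	/* = 512 */
#define EX_INDIRECT_GREFS(segs) \
	(((segs) + EX_SEGS_PER_INDIRECT_FRAME - 1) / EX_SEGS_PER_INDIRECT_FRAME)

/* EX_INDIRECT_GREFS(32)  == 1: the default fits in one indirect page
 * EX_INDIRECT_GREFS(512) == 1: still a single page
 * EX_INDIRECT_GREFS(513) == 2: a second indirect grant is needed */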
@@ -358,7 +383,8 @@ static int blkif_queue_request(struct request *req) | |||
358 | struct blkif_request *ring_req; | 383 | struct blkif_request *ring_req; |
359 | unsigned long id; | 384 | unsigned long id; |
360 | unsigned int fsect, lsect; | 385 | unsigned int fsect, lsect; |
361 | int i, ref; | 386 | int i, ref, n; |
387 | struct blkif_request_segment_aligned *segments = NULL; | ||
362 | 388 | ||
363 | /* | 389 | /* |
364 | * Used to store if we are able to queue the request by just using | 390 | * Used to store if we are able to queue the request by just using |
@@ -369,21 +395,27 @@ static int blkif_queue_request(struct request *req) | |||
369 | grant_ref_t gref_head; | 395 | grant_ref_t gref_head; |
370 | struct grant *gnt_list_entry = NULL; | 396 | struct grant *gnt_list_entry = NULL; |
371 | struct scatterlist *sg; | 397 | struct scatterlist *sg; |
398 | int nseg, max_grefs; | ||
372 | 399 | ||
373 | if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) | 400 | if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) |
374 | return 1; | 401 | return 1; |
375 | 402 | ||
376 | /* Check if we have enought grants to allocate a requests */ | 403 | max_grefs = info->max_indirect_segments ? |
377 | if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) { | 404 | info->max_indirect_segments + |
405 | INDIRECT_GREFS(info->max_indirect_segments) : | ||
406 | BLKIF_MAX_SEGMENTS_PER_REQUEST; | ||
407 | |||
408 | /* Check if we have enough grants to allocate a request */ | ||
409 | if (info->persistent_gnts_c < max_grefs) { | ||
378 | new_persistent_gnts = 1; | 410 | new_persistent_gnts = 1; |
379 | if (gnttab_alloc_grant_references( | 411 | if (gnttab_alloc_grant_references( |
380 | BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c, | 412 | max_grefs - info->persistent_gnts_c, |
381 | &gref_head) < 0) { | 413 | &gref_head) < 0) { |
382 | gnttab_request_free_callback( | 414 | gnttab_request_free_callback( |
383 | &info->callback, | 415 | &info->callback, |
384 | blkif_restart_queue_callback, | 416 | blkif_restart_queue_callback, |
385 | info, | 417 | info, |
386 | BLKIF_MAX_SEGMENTS_PER_REQUEST); | 418 | max_grefs); |
387 | return 1; | 419 | return 1; |
388 | } | 420 | } |
389 | } else | 421 | } else |
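In concrete numbers (using the 512-entries-per-page figure assumed above, and the classic blkif limit of 11 segments for BLKIF_MAX_SEGMENTS_PER_REQUEST), the per-request grant budget max_grefs computed above works out as:

/* max_indirect_segments == 0   -> max_grefs = 11  (classic request)
 * max_indirect_segments == 32  -> max_grefs = 32  + 1 = 33
 * max_indirect_segments == 256 -> max_grefs = 256 + 1 = 257
 * The free callback is re-armed with the same count, so the queue is only
 * restarted once enough grant references for a full request are available. */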
@@ -394,42 +426,67 @@ static int blkif_queue_request(struct request *req) | |||
394 | id = get_id_from_freelist(info); | 426 | id = get_id_from_freelist(info); |
395 | info->shadow[id].request = req; | 427 | info->shadow[id].request = req; |
396 | 428 | ||
397 | ring_req->u.rw.id = id; | ||
398 | ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); | ||
399 | ring_req->u.rw.handle = info->handle; | ||
400 | |||
401 | ring_req->operation = rq_data_dir(req) ? | ||
402 | BLKIF_OP_WRITE : BLKIF_OP_READ; | ||
403 | |||
404 | if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { | ||
405 | /* | ||
406 | * Ideally we can do an unordered flush-to-disk. In case the | ||
407 | * backend onlysupports barriers, use that. A barrier request | ||
408 | * a superset of FUA, so we can implement it the same | ||
409 | * way. (It's also a FLUSH+FUA, since it is | ||
410 | * guaranteed ordered WRT previous writes.) | ||
411 | */ | ||
412 | ring_req->operation = info->flush_op; | ||
413 | } | ||
414 | |||
415 | if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) { | 429 | if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) { |
416 | /* id, sector_number and handle are set above. */ | ||
417 | ring_req->operation = BLKIF_OP_DISCARD; | 430 | ring_req->operation = BLKIF_OP_DISCARD; |
418 | ring_req->u.discard.nr_sectors = blk_rq_sectors(req); | 431 | ring_req->u.discard.nr_sectors = blk_rq_sectors(req); |
432 | ring_req->u.discard.id = id; | ||
433 | ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req); | ||
419 | if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) | 434 | if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) |
420 | ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; | 435 | ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; |
421 | else | 436 | else |
422 | ring_req->u.discard.flag = 0; | 437 | ring_req->u.discard.flag = 0; |
423 | } else { | 438 | } else { |
424 | ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req, | 439 | BUG_ON(info->max_indirect_segments == 0 && |
425 | info->sg); | 440 | req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); |
426 | BUG_ON(ring_req->u.rw.nr_segments > | 441 | BUG_ON(info->max_indirect_segments && |
427 | BLKIF_MAX_SEGMENTS_PER_REQUEST); | 442 | req->nr_phys_segments > info->max_indirect_segments); |
428 | 443 | nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); | |
429 | for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) { | 444 | ring_req->u.rw.id = id; |
445 | if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) { | ||
446 | /* | ||
447 | * The indirect operation can only be a BLKIF_OP_READ or | ||
448 | * BLKIF_OP_WRITE | ||
449 | */ | ||
450 | BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA)); | ||
451 | ring_req->operation = BLKIF_OP_INDIRECT; | ||
452 | ring_req->u.indirect.indirect_op = rq_data_dir(req) ? | ||
453 | BLKIF_OP_WRITE : BLKIF_OP_READ; | ||
454 | ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req); | ||
455 | ring_req->u.indirect.handle = info->handle; | ||
456 | ring_req->u.indirect.nr_segments = nseg; | ||
457 | } else { | ||
458 | ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); | ||
459 | ring_req->u.rw.handle = info->handle; | ||
460 | ring_req->operation = rq_data_dir(req) ? | ||
461 | BLKIF_OP_WRITE : BLKIF_OP_READ; | ||
462 | if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { | ||
463 | /* | ||
464 | * Ideally we can do an unordered flush-to-disk. In case the | ||
465 | * backend only supports barriers, use that. A barrier request | ||
466 | * is a superset of FUA, so we can implement it the same | ||
467 | * way. (It's also a FLUSH+FUA, since it is | ||
468 | * guaranteed ordered WRT previous writes.) | ||
469 | */ | ||
470 | ring_req->operation = info->flush_op; | ||
471 | } | ||
472 | ring_req->u.rw.nr_segments = nseg; | ||
473 | } | ||
474 | for_each_sg(info->shadow[id].sg, sg, nseg, i) { | ||
430 | fsect = sg->offset >> 9; | 475 | fsect = sg->offset >> 9; |
431 | lsect = fsect + (sg->length >> 9) - 1; | 476 | lsect = fsect + (sg->length >> 9) - 1; |
432 | 477 | ||
478 | if ((ring_req->operation == BLKIF_OP_INDIRECT) && | ||
479 | (i % SEGS_PER_INDIRECT_FRAME == 0)) { | ||
480 | if (segments) | ||
481 | kunmap_atomic(segments); | ||
482 | |||
483 | n = i / SEGS_PER_INDIRECT_FRAME; | ||
484 | gnt_list_entry = get_grant(&gref_head, info); | ||
485 | info->shadow[id].indirect_grants[n] = gnt_list_entry; | ||
486 | segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); | ||
487 | ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; | ||
488 | } | ||
489 | |||
433 | gnt_list_entry = get_grant(&gref_head, info); | 490 | gnt_list_entry = get_grant(&gref_head, info); |
434 | ref = gnt_list_entry->gref; | 491 | ref = gnt_list_entry->gref; |
435 | 492 | ||
@@ -441,8 +498,7 @@ static int blkif_queue_request(struct request *req) | |||
441 | 498 | ||
442 | BUG_ON(sg->offset + sg->length > PAGE_SIZE); | 499 | BUG_ON(sg->offset + sg->length > PAGE_SIZE); |
443 | 500 | ||
444 | shared_data = kmap_atomic( | 501 | shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); |
445 | pfn_to_page(gnt_list_entry->pfn)); | ||
446 | bvec_data = kmap_atomic(sg_page(sg)); | 502 | bvec_data = kmap_atomic(sg_page(sg)); |
447 | 503 | ||
448 | /* | 504 | /* |
@@ -461,13 +517,23 @@ static int blkif_queue_request(struct request *req) | |||
461 | kunmap_atomic(bvec_data); | 517 | kunmap_atomic(bvec_data); |
462 | kunmap_atomic(shared_data); | 518 | kunmap_atomic(shared_data); |
463 | } | 519 | } |
464 | 520 | if (ring_req->operation != BLKIF_OP_INDIRECT) { | |
465 | ring_req->u.rw.seg[i] = | 521 | ring_req->u.rw.seg[i] = |
466 | (struct blkif_request_segment) { | 522 | (struct blkif_request_segment) { |
467 | .gref = ref, | 523 | .gref = ref, |
468 | .first_sect = fsect, | 524 | .first_sect = fsect, |
469 | .last_sect = lsect }; | 525 | .last_sect = lsect }; |
526 | } else { | ||
527 | n = i % SEGS_PER_INDIRECT_FRAME; | ||
528 | segments[n] = | ||
529 | (struct blkif_request_segment_aligned) { | ||
530 | .gref = ref, | ||
531 | .first_sect = fsect, | ||
532 | .last_sect = lsect }; | ||
533 | } | ||
470 | } | 534 | } |
535 | if (segments) | ||
536 | kunmap_atomic(segments); | ||
471 | } | 537 | } |
472 | 538 | ||
473 | info->ring.req_prod_pvt++; | 539 | info->ring.req_prod_pvt++; |
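For indirect requests, the segment loop above places each scatterlist entry by splitting its index into an indirect-page number and a slot within that page; a sketch of the mapping, again assuming 512 entries per indirect page:

unsigned int frame = i / SEGS_PER_INDIRECT_FRAME;	/* which indirect page    */
unsigned int slot  = i % SEGS_PER_INDIRECT_FRAME;	/* entry inside that page */

/* i == 0   -> frame 0, slot 0   (first indirect page is kmapped here)
 * i == 511 -> frame 0, slot 511
 * i == 512 -> frame 1, slot 0   (previous page kunmapped, next one mapped) */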
@@ -542,7 +608,9 @@ wait: | |||
542 | flush_requests(info); | 608 | flush_requests(info); |
543 | } | 609 | } |
544 | 610 | ||
545 | static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) | 611 | static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, |
612 | unsigned int physical_sector_size, | ||
613 | unsigned int segments) | ||
546 | { | 614 | { |
547 | struct request_queue *rq; | 615 | struct request_queue *rq; |
548 | struct blkfront_info *info = gd->private_data; | 616 | struct blkfront_info *info = gd->private_data; |
@@ -564,14 +632,15 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) | |||
564 | 632 | ||
565 | /* Hard sector size and max sectors impersonate the equiv. hardware. */ | 633 | /* Hard sector size and max sectors impersonate the equiv. hardware. */ |
566 | blk_queue_logical_block_size(rq, sector_size); | 634 | blk_queue_logical_block_size(rq, sector_size); |
567 | blk_queue_max_hw_sectors(rq, 512); | 635 | blk_queue_physical_block_size(rq, physical_sector_size); |
636 | blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512); | ||
568 | 637 | ||
569 | /* Each segment in a request is up to an aligned page in size. */ | 638 | /* Each segment in a request is up to an aligned page in size. */ |
570 | blk_queue_segment_boundary(rq, PAGE_SIZE - 1); | 639 | blk_queue_segment_boundary(rq, PAGE_SIZE - 1); |
571 | blk_queue_max_segment_size(rq, PAGE_SIZE); | 640 | blk_queue_max_segment_size(rq, PAGE_SIZE); |
572 | 641 | ||
573 | /* Ensure a merged request will fit in a single I/O ring slot. */ | 642 | /* Ensure a merged request will fit in a single I/O ring slot. */ |
574 | blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); | 643 | blk_queue_max_segments(rq, segments); |
575 | 644 | ||
576 | /* Make sure buffer addresses are sector-aligned. */ | 645 | /* Make sure buffer addresses are sector-aligned. */ |
577 | blk_queue_dma_alignment(rq, 511); | 646 | blk_queue_dma_alignment(rq, 511); |
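The replaced hard-coded limit of 512 sectors (256 KiB) is now derived from the negotiated segment count; assuming 4 KiB pages:

/* segments = 11 (no indirect) -> (11  * 4096) / 512 =   88 sectors =  44 KiB per request
 * segments = 32               -> (32  * 4096) / 512 =  256 sectors = 128 KiB per request
 * segments = 256              -> (256 * 4096) / 512 = 2048 sectors =   1 MiB per request */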
@@ -588,13 +657,16 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) | |||
588 | static void xlvbd_flush(struct blkfront_info *info) | 657 | static void xlvbd_flush(struct blkfront_info *info) |
589 | { | 658 | { |
590 | blk_queue_flush(info->rq, info->feature_flush); | 659 | blk_queue_flush(info->rq, info->feature_flush); |
591 | printk(KERN_INFO "blkfront: %s: %s: %s %s\n", | 660 | printk(KERN_INFO "blkfront: %s: %s: %s %s %s %s %s\n", |
592 | info->gd->disk_name, | 661 | info->gd->disk_name, |
593 | info->flush_op == BLKIF_OP_WRITE_BARRIER ? | 662 | info->flush_op == BLKIF_OP_WRITE_BARRIER ? |
594 | "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? | 663 | "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? |
595 | "flush diskcache" : "barrier or flush"), | 664 | "flush diskcache" : "barrier or flush"), |
596 | info->feature_flush ? "enabled" : "disabled", | 665 | info->feature_flush ? "enabled;" : "disabled;", |
597 | info->feature_persistent ? "using persistent grants" : ""); | 666 | "persistent grants:", |
667 | info->feature_persistent ? "enabled;" : "disabled;", | ||
668 | "indirect descriptors:", | ||
669 | info->max_indirect_segments ? "enabled;" : "disabled;"); | ||
598 | } | 670 | } |
599 | 671 | ||
600 | static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) | 672 | static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) |
@@ -667,7 +739,8 @@ static char *encode_disk_name(char *ptr, unsigned int n) | |||
667 | 739 | ||
668 | static int xlvbd_alloc_gendisk(blkif_sector_t capacity, | 740 | static int xlvbd_alloc_gendisk(blkif_sector_t capacity, |
669 | struct blkfront_info *info, | 741 | struct blkfront_info *info, |
670 | u16 vdisk_info, u16 sector_size) | 742 | u16 vdisk_info, u16 sector_size, |
743 | unsigned int physical_sector_size) | ||
671 | { | 744 | { |
672 | struct gendisk *gd; | 745 | struct gendisk *gd; |
673 | int nr_minors = 1; | 746 | int nr_minors = 1; |
@@ -734,7 +807,9 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, | |||
734 | gd->driverfs_dev = &(info->xbdev->dev); | 807 | gd->driverfs_dev = &(info->xbdev->dev); |
735 | set_capacity(gd, capacity); | 808 | set_capacity(gd, capacity); |
736 | 809 | ||
737 | if (xlvbd_init_blk_queue(gd, sector_size)) { | 810 | if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size, |
811 | info->max_indirect_segments ? : | ||
812 | BLKIF_MAX_SEGMENTS_PER_REQUEST)) { | ||
738 | del_gendisk(gd); | 813 | del_gendisk(gd); |
739 | goto release; | 814 | goto release; |
740 | } | 815 | } |
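The "?:" in the call above is the GCC conditional-with-omitted-operand extension, i.e. the negotiated indirect segment count is used when it is non-zero and the classic limit otherwise:

segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
/* is shorthand for */
segs = info->max_indirect_segments ? info->max_indirect_segments
				   : BLKIF_MAX_SEGMENTS_PER_REQUEST;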
@@ -818,6 +893,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) | |||
818 | { | 893 | { |
819 | struct grant *persistent_gnt; | 894 | struct grant *persistent_gnt; |
820 | struct grant *n; | 895 | struct grant *n; |
896 | int i, j, segs; | ||
821 | 897 | ||
822 | /* Prevent new requests being issued until we fix things up. */ | 898 | /* Prevent new requests being issued until we fix things up. */ |
823 | spin_lock_irq(&info->io_lock); | 899 | spin_lock_irq(&info->io_lock); |
@@ -843,6 +919,47 @@ static void blkif_free(struct blkfront_info *info, int suspend) | |||
843 | } | 919 | } |
844 | BUG_ON(info->persistent_gnts_c != 0); | 920 | BUG_ON(info->persistent_gnts_c != 0); |
845 | 921 | ||
922 | for (i = 0; i < BLK_RING_SIZE; i++) { | ||
923 | /* | ||
924 | * Clear persistent grants present in requests already | ||
925 | * on the shared ring | ||
926 | */ | ||
927 | if (!info->shadow[i].request) | ||
928 | goto free_shadow; | ||
929 | |||
930 | segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ? | ||
931 | info->shadow[i].req.u.indirect.nr_segments : | ||
932 | info->shadow[i].req.u.rw.nr_segments; | ||
933 | for (j = 0; j < segs; j++) { | ||
934 | persistent_gnt = info->shadow[i].grants_used[j]; | ||
935 | gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); | ||
936 | __free_page(pfn_to_page(persistent_gnt->pfn)); | ||
937 | kfree(persistent_gnt); | ||
938 | } | ||
939 | |||
940 | if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT) | ||
941 | /* | ||
942 | * If this is not an indirect operation don't try to | ||
943 | * free indirect segments | ||
944 | */ | ||
945 | goto free_shadow; | ||
946 | |||
947 | for (j = 0; j < INDIRECT_GREFS(segs); j++) { | ||
948 | persistent_gnt = info->shadow[i].indirect_grants[j]; | ||
949 | gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); | ||
950 | __free_page(pfn_to_page(persistent_gnt->pfn)); | ||
951 | kfree(persistent_gnt); | ||
952 | } | ||
953 | |||
954 | free_shadow: | ||
955 | kfree(info->shadow[i].grants_used); | ||
956 | info->shadow[i].grants_used = NULL; | ||
957 | kfree(info->shadow[i].indirect_grants); | ||
958 | info->shadow[i].indirect_grants = NULL; | ||
959 | kfree(info->shadow[i].sg); | ||
960 | info->shadow[i].sg = NULL; | ||
961 | } | ||
962 | |||
846 | /* No more gnttab callback work. */ | 963 | /* No more gnttab callback work. */ |
847 | gnttab_cancel_free_callback(&info->callback); | 964 | gnttab_cancel_free_callback(&info->callback); |
848 | spin_unlock_irq(&info->io_lock); | 965 | spin_unlock_irq(&info->io_lock); |
@@ -867,12 +984,13 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, | |||
867 | struct blkif_response *bret) | 984 | struct blkif_response *bret) |
868 | { | 985 | { |
869 | int i = 0; | 986 | int i = 0; |
870 | struct bio_vec *bvec; | 987 | struct scatterlist *sg; |
871 | struct req_iterator iter; | ||
872 | unsigned long flags; | ||
873 | char *bvec_data; | 988 | char *bvec_data; |
874 | void *shared_data; | 989 | void *shared_data; |
875 | unsigned int offset = 0; | 990 | int nseg; |
991 | |||
992 | nseg = s->req.operation == BLKIF_OP_INDIRECT ? | ||
993 | s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; | ||
876 | 994 | ||
877 | if (bret->operation == BLKIF_OP_READ) { | 995 | if (bret->operation == BLKIF_OP_READ) { |
878 | /* | 996 | /* |
@@ -881,26 +999,29 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, | |||
881 | * than PAGE_SIZE, we have to keep track of the current offset, | 999 | * than PAGE_SIZE, we have to keep track of the current offset, |
882 | * to be sure we are copying the data from the right shared page. | 1000 | * to be sure we are copying the data from the right shared page. |
883 | */ | 1001 | */ |
884 | rq_for_each_segment(bvec, s->request, iter) { | 1002 | for_each_sg(s->sg, sg, nseg, i) { |
885 | BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE); | 1003 | BUG_ON(sg->offset + sg->length > PAGE_SIZE); |
886 | if (bvec->bv_offset < offset) | ||
887 | i++; | ||
888 | BUG_ON(i >= s->req.u.rw.nr_segments); | ||
889 | shared_data = kmap_atomic( | 1004 | shared_data = kmap_atomic( |
890 | pfn_to_page(s->grants_used[i]->pfn)); | 1005 | pfn_to_page(s->grants_used[i]->pfn)); |
891 | bvec_data = bvec_kmap_irq(bvec, &flags); | 1006 | bvec_data = kmap_atomic(sg_page(sg)); |
892 | memcpy(bvec_data, shared_data + bvec->bv_offset, | 1007 | memcpy(bvec_data + sg->offset, |
893 | bvec->bv_len); | 1008 | shared_data + sg->offset, |
894 | bvec_kunmap_irq(bvec_data, &flags); | 1009 | sg->length); |
1010 | kunmap_atomic(bvec_data); | ||
895 | kunmap_atomic(shared_data); | 1011 | kunmap_atomic(shared_data); |
896 | offset = bvec->bv_offset + bvec->bv_len; | ||
897 | } | 1012 | } |
898 | } | 1013 | } |
899 | /* Add the persistent grant into the list of free grants */ | 1014 | /* Add the persistent grant into the list of free grants */ |
900 | for (i = 0; i < s->req.u.rw.nr_segments; i++) { | 1015 | for (i = 0; i < nseg; i++) { |
901 | list_add(&s->grants_used[i]->node, &info->persistent_gnts); | 1016 | list_add(&s->grants_used[i]->node, &info->persistent_gnts); |
902 | info->persistent_gnts_c++; | 1017 | info->persistent_gnts_c++; |
903 | } | 1018 | } |
1019 | if (s->req.operation == BLKIF_OP_INDIRECT) { | ||
1020 | for (i = 0; i < INDIRECT_GREFS(nseg); i++) { | ||
1021 | list_add(&s->indirect_grants[i]->node, &info->persistent_gnts); | ||
1022 | info->persistent_gnts_c++; | ||
1023 | } | ||
1024 | } | ||
904 | } | 1025 | } |
905 | 1026 | ||
906 | static irqreturn_t blkif_interrupt(int irq, void *dev_id) | 1027 | static irqreturn_t blkif_interrupt(int irq, void *dev_id) |
@@ -1034,14 +1155,6 @@ static int setup_blkring(struct xenbus_device *dev, | |||
1034 | SHARED_RING_INIT(sring); | 1155 | SHARED_RING_INIT(sring); |
1035 | FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); | 1156 | FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); |
1036 | 1157 | ||
1037 | sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST); | ||
1038 | |||
1039 | /* Allocate memory for grants */ | ||
1040 | err = fill_grant_buffer(info, BLK_RING_SIZE * | ||
1041 | BLKIF_MAX_SEGMENTS_PER_REQUEST); | ||
1042 | if (err) | ||
1043 | goto fail; | ||
1044 | |||
1045 | err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); | 1158 | err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); |
1046 | if (err < 0) { | 1159 | if (err < 0) { |
1047 | free_page((unsigned long)sring); | 1160 | free_page((unsigned long)sring); |
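The fill_grant_buffer() call removed here reappears in blkfront_setup_indirect() further down in this diff, because the pool size now depends on the negotiated segment count. Assuming BLK_RING_SIZE is 32 (its usual value with 4 KiB pages):

/* old: BLK_RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST = 32 * 11       =  352 grants
 * new: BLK_RING_SIZE * (segs + INDIRECT_GREFS(segs))  = 32 * (32 + 1) = 1056 grants
 *      with the default of 32 indirect segments per request */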
@@ -1223,13 +1336,84 @@ static int blkfront_probe(struct xenbus_device *dev, | |||
1223 | return 0; | 1336 | return 0; |
1224 | } | 1337 | } |
1225 | 1338 | ||
1339 | /* | ||
1340 | * This is a clone of md_trim_bio, used to split a bio into smaller ones | ||
1341 | */ | ||
1342 | static void trim_bio(struct bio *bio, int offset, int size) | ||
1343 | { | ||
1344 | /* 'bio' is a cloned bio which we need to trim to match | ||
1345 | * the given offset and size. | ||
1346 | * This requires adjusting bi_sector, bi_size, and bi_io_vec | ||
1347 | */ | ||
1348 | int i; | ||
1349 | struct bio_vec *bvec; | ||
1350 | int sofar = 0; | ||
1351 | |||
1352 | size <<= 9; | ||
1353 | if (offset == 0 && size == bio->bi_size) | ||
1354 | return; | ||
1355 | |||
1356 | bio->bi_sector += offset; | ||
1357 | bio->bi_size = size; | ||
1358 | offset <<= 9; | ||
1359 | clear_bit(BIO_SEG_VALID, &bio->bi_flags); | ||
1360 | |||
1361 | while (bio->bi_idx < bio->bi_vcnt && | ||
1362 | bio->bi_io_vec[bio->bi_idx].bv_len <= offset) { | ||
1363 | /* remove this whole bio_vec */ | ||
1364 | offset -= bio->bi_io_vec[bio->bi_idx].bv_len; | ||
1365 | bio->bi_idx++; | ||
1366 | } | ||
1367 | if (bio->bi_idx < bio->bi_vcnt) { | ||
1368 | bio->bi_io_vec[bio->bi_idx].bv_offset += offset; | ||
1369 | bio->bi_io_vec[bio->bi_idx].bv_len -= offset; | ||
1370 | } | ||
1371 | /* avoid any complications with bi_idx being non-zero*/ | ||
1372 | if (bio->bi_idx) { | ||
1373 | memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, | ||
1374 | (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec)); | ||
1375 | bio->bi_vcnt -= bio->bi_idx; | ||
1376 | bio->bi_idx = 0; | ||
1377 | } | ||
1378 | /* Make sure vcnt and last bv are not too big */ | ||
1379 | bio_for_each_segment(bvec, bio, i) { | ||
1380 | if (sofar + bvec->bv_len > size) | ||
1381 | bvec->bv_len = size - sofar; | ||
1382 | if (bvec->bv_len == 0) { | ||
1383 | bio->bi_vcnt = i; | ||
1384 | break; | ||
1385 | } | ||
1386 | sofar += bvec->bv_len; | ||
1387 | } | ||
1388 | } | ||
1389 | |||
1390 | static void split_bio_end(struct bio *bio, int error) | ||
1391 | { | ||
1392 | struct split_bio *split_bio = bio->bi_private; | ||
1393 | |||
1394 | if (error) | ||
1395 | split_bio->err = error; | ||
1396 | |||
1397 | if (atomic_dec_and_test(&split_bio->pending)) { | ||
1398 | split_bio->bio->bi_phys_segments = 0; | ||
1399 | bio_endio(split_bio->bio, split_bio->err); | ||
1400 | kfree(split_bio); | ||
1401 | } | ||
1402 | bio_put(bio); | ||
1403 | } | ||
1226 | 1404 | ||
1227 | static int blkif_recover(struct blkfront_info *info) | 1405 | static int blkif_recover(struct blkfront_info *info) |
1228 | { | 1406 | { |
1229 | int i; | 1407 | int i; |
1230 | struct blkif_request *req; | 1408 | struct request *req, *n; |
1231 | struct blk_shadow *copy; | 1409 | struct blk_shadow *copy; |
1232 | int j; | 1410 | int rc; |
1411 | struct bio *bio, *cloned_bio; | ||
1412 | struct bio_list bio_list, merge_bio; | ||
1413 | unsigned int segs, offset; | ||
1414 | int pending, size; | ||
1415 | struct split_bio *split_bio; | ||
1416 | struct list_head requests; | ||
1233 | 1417 | ||
1234 | /* Stage 1: Make a safe copy of the shadow state. */ | 1418 | /* Stage 1: Make a safe copy of the shadow state. */ |
1235 | copy = kmemdup(info->shadow, sizeof(info->shadow), | 1419 | copy = kmemdup(info->shadow, sizeof(info->shadow), |
@@ -1244,36 +1428,64 @@ static int blkif_recover(struct blkfront_info *info) | |||
1244 | info->shadow_free = info->ring.req_prod_pvt; | 1428 | info->shadow_free = info->ring.req_prod_pvt; |
1245 | info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; | 1429 | info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; |
1246 | 1430 | ||
1247 | /* Stage 3: Find pending requests and requeue them. */ | 1431 | rc = blkfront_setup_indirect(info); |
1432 | if (rc) { | ||
1433 | kfree(copy); | ||
1434 | return rc; | ||
1435 | } | ||
1436 | |||
1437 | segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; | ||
1438 | blk_queue_max_segments(info->rq, segs); | ||
1439 | bio_list_init(&bio_list); | ||
1440 | INIT_LIST_HEAD(&requests); | ||
1248 | for (i = 0; i < BLK_RING_SIZE; i++) { | 1441 | for (i = 0; i < BLK_RING_SIZE; i++) { |
1249 | /* Not in use? */ | 1442 | /* Not in use? */ |
1250 | if (!copy[i].request) | 1443 | if (!copy[i].request) |
1251 | continue; | 1444 | continue; |
1252 | 1445 | ||
1253 | /* Grab a request slot and copy shadow state into it. */ | 1446 | /* |
1254 | req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); | 1447 | * Get the bios in the request so we can re-queue them. |
1255 | *req = copy[i].req; | 1448 | */ |
1256 | 1449 | if (copy[i].request->cmd_flags & | |
1257 | /* We get a new request id, and must reset the shadow state. */ | 1450 | (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { |
1258 | req->u.rw.id = get_id_from_freelist(info); | 1451 | /* |
1259 | memcpy(&info->shadow[req->u.rw.id], ©[i], sizeof(copy[i])); | 1452 | * Flush operations don't contain bios, so |
1260 | 1453 | * we need to requeue the whole request | |
1261 | if (req->operation != BLKIF_OP_DISCARD) { | 1454 | */ |
1262 | /* Rewrite any grant references invalidated by susp/resume. */ | 1455 | list_add(©[i].request->queuelist, &requests); |
1263 | for (j = 0; j < req->u.rw.nr_segments; j++) | 1456 | continue; |
1264 | gnttab_grant_foreign_access_ref( | ||
1265 | req->u.rw.seg[j].gref, | ||
1266 | info->xbdev->otherend_id, | ||
1267 | pfn_to_mfn(copy[i].grants_used[j]->pfn), | ||
1268 | 0); | ||
1269 | } | 1457 | } |
1270 | info->shadow[req->u.rw.id].req = *req; | 1458 | merge_bio.head = copy[i].request->bio; |
1271 | 1459 | merge_bio.tail = copy[i].request->biotail; | |
1272 | info->ring.req_prod_pvt++; | 1460 | bio_list_merge(&bio_list, &merge_bio); |
1461 | copy[i].request->bio = NULL; | ||
1462 | blk_put_request(copy[i].request); | ||
1273 | } | 1463 | } |
1274 | 1464 | ||
1275 | kfree(copy); | 1465 | kfree(copy); |
1276 | 1466 | ||
1467 | /* | ||
1468 | * Empty the queue, this is important because we might have | ||
1469 | * requests in the queue with more segments than what we | ||
1470 | * can handle now. | ||
1471 | */ | ||
1472 | spin_lock_irq(&info->io_lock); | ||
1473 | while ((req = blk_fetch_request(info->rq)) != NULL) { | ||
1474 | if (req->cmd_flags & | ||
1475 | (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { | ||
1476 | list_add(&req->queuelist, &requests); | ||
1477 | continue; | ||
1478 | } | ||
1479 | merge_bio.head = req->bio; | ||
1480 | merge_bio.tail = req->biotail; | ||
1481 | bio_list_merge(&bio_list, &merge_bio); | ||
1482 | req->bio = NULL; | ||
1483 | if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) | ||
1484 | pr_alert("diskcache flush request found!\n"); | ||
1485 | __blk_put_request(info->rq, req); | ||
1486 | } | ||
1487 | spin_unlock_irq(&info->io_lock); | ||
1488 | |||
1277 | xenbus_switch_state(info->xbdev, XenbusStateConnected); | 1489 | xenbus_switch_state(info->xbdev, XenbusStateConnected); |
1278 | 1490 | ||
1279 | spin_lock_irq(&info->io_lock); | 1491 | spin_lock_irq(&info->io_lock); |
@@ -1281,14 +1493,50 @@ static int blkif_recover(struct blkfront_info *info) | |||
1281 | /* Now safe for us to use the shared ring */ | 1493 | /* Now safe for us to use the shared ring */ |
1282 | info->connected = BLKIF_STATE_CONNECTED; | 1494 | info->connected = BLKIF_STATE_CONNECTED; |
1283 | 1495 | ||
1284 | /* Send off requeued requests */ | ||
1285 | flush_requests(info); | ||
1286 | |||
1287 | /* Kick any other new requests queued since we resumed */ | 1496 | /* Kick any other new requests queued since we resumed */ |
1288 | kick_pending_request_queues(info); | 1497 | kick_pending_request_queues(info); |
1289 | 1498 | ||
1499 | list_for_each_entry_safe(req, n, &requests, queuelist) { | ||
1500 | /* Requeue pending requests (flush or discard) */ | ||
1501 | list_del_init(&req->queuelist); | ||
1502 | BUG_ON(req->nr_phys_segments > segs); | ||
1503 | blk_requeue_request(info->rq, req); | ||
1504 | } | ||
1290 | spin_unlock_irq(&info->io_lock); | 1505 | spin_unlock_irq(&info->io_lock); |
1291 | 1506 | ||
1507 | while ((bio = bio_list_pop(&bio_list)) != NULL) { | ||
1508 | /* Traverse the list of pending bios and re-queue them */ | ||
1509 | if (bio_segments(bio) > segs) { | ||
1510 | /* | ||
1511 | * This bio has more segments than what we can | ||
1512 | * handle, we have to split it. | ||
1513 | */ | ||
1514 | pending = (bio_segments(bio) + segs - 1) / segs; | ||
1515 | split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO); | ||
1516 | BUG_ON(split_bio == NULL); | ||
1517 | atomic_set(&split_bio->pending, pending); | ||
1518 | split_bio->bio = bio; | ||
1519 | for (i = 0; i < pending; i++) { | ||
1520 | offset = (i * segs * PAGE_SIZE) >> 9; | ||
1521 | size = min((unsigned int)(segs * PAGE_SIZE) >> 9, | ||
1522 | (unsigned int)(bio->bi_size >> 9) - offset); | ||
1523 | cloned_bio = bio_clone(bio, GFP_NOIO); | ||
1524 | BUG_ON(cloned_bio == NULL); | ||
1525 | trim_bio(cloned_bio, offset, size); | ||
1526 | cloned_bio->bi_private = split_bio; | ||
1527 | cloned_bio->bi_end_io = split_bio_end; | ||
1528 | submit_bio(cloned_bio->bi_rw, cloned_bio); | ||
1529 | } | ||
1530 | /* | ||
1531 | * Now we have to wait for all those smaller bios to | ||
1532 | * end, so we can also end the "parent" bio. | ||
1533 | */ | ||
1534 | continue; | ||
1535 | } | ||
1536 | /* We don't need to split this bio */ | ||
1537 | submit_bio(bio->bi_rw, bio); | ||
1538 | } | ||
1539 | |||
1292 | return 0; | 1540 | return 0; |
1293 | } | 1541 | } |
1294 | 1542 | ||
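Worked numbers for the split loop above, assuming 4 KiB pages and bios whose segments are full pages (real bios may have smaller segments, so these are upper bounds):

/* segs = 32, bio_segments(bio) = 96 (bi_size = 768 sectors):
 *   pending = (96 + 32 - 1) / 32 = 3 clones
 *   clone 0: offset =   0 sectors, size = (32 * 4096) >> 9 = 256 sectors
 *   clone 1: offset = 256 sectors, size = 256 sectors
 *   clone 2: offset = 512 sectors, size = the remaining 256 sectors
 * split_bio->pending drops to zero only after all three clones complete, at
 * which point split_bio_end() ends the original (parent) bio. */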
@@ -1308,8 +1556,12 @@ static int blkfront_resume(struct xenbus_device *dev) | |||
1308 | blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); | 1556 | blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); |
1309 | 1557 | ||
1310 | err = talk_to_blkback(dev, info); | 1558 | err = talk_to_blkback(dev, info); |
1311 | if (info->connected == BLKIF_STATE_SUSPENDED && !err) | 1559 | |
1312 | err = blkif_recover(info); | 1560 | /* |
1561 | * We have to wait for the backend to switch to | ||
1562 | * connected state, since we want to read which | ||
1563 | * features it supports. | ||
1564 | */ | ||
1313 | 1565 | ||
1314 | return err; | 1566 | return err; |
1315 | } | 1567 | } |
@@ -1387,6 +1639,60 @@ static void blkfront_setup_discard(struct blkfront_info *info) | |||
1387 | kfree(type); | 1639 | kfree(type); |
1388 | } | 1640 | } |
1389 | 1641 | ||
1642 | static int blkfront_setup_indirect(struct blkfront_info *info) | ||
1643 | { | ||
1644 | unsigned int indirect_segments, segs; | ||
1645 | int err, i; | ||
1646 | |||
1647 | err = xenbus_gather(XBT_NIL, info->xbdev->otherend, | ||
1648 | "feature-max-indirect-segments", "%u", &indirect_segments, | ||
1649 | NULL); | ||
1650 | if (err) { | ||
1651 | info->max_indirect_segments = 0; | ||
1652 | segs = BLKIF_MAX_SEGMENTS_PER_REQUEST; | ||
1653 | } else { | ||
1654 | info->max_indirect_segments = min(indirect_segments, | ||
1655 | xen_blkif_max_segments); | ||
1656 | segs = info->max_indirect_segments; | ||
1657 | } | ||
1658 | |||
1659 | err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE); | ||
1660 | if (err) | ||
1661 | goto out_of_memory; | ||
1662 | |||
1663 | for (i = 0; i < BLK_RING_SIZE; i++) { | ||
1664 | info->shadow[i].grants_used = kzalloc( | ||
1665 | sizeof(info->shadow[i].grants_used[0]) * segs, | ||
1666 | GFP_NOIO); | ||
1667 | info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * segs, GFP_NOIO); | ||
1668 | if (info->max_indirect_segments) | ||
1669 | info->shadow[i].indirect_grants = kzalloc( | ||
1670 | sizeof(info->shadow[i].indirect_grants[0]) * | ||
1671 | INDIRECT_GREFS(segs), | ||
1672 | GFP_NOIO); | ||
1673 | if ((info->shadow[i].grants_used == NULL) || | ||
1674 | (info->shadow[i].sg == NULL) || | ||
1675 | (info->max_indirect_segments && | ||
1676 | (info->shadow[i].indirect_grants == NULL))) | ||
1677 | goto out_of_memory; | ||
1678 | sg_init_table(info->shadow[i].sg, segs); | ||
1679 | } | ||
1680 | |||
1681 | |||
1682 | return 0; | ||
1683 | |||
1684 | out_of_memory: | ||
1685 | for (i = 0; i < BLK_RING_SIZE; i++) { | ||
1686 | kfree(info->shadow[i].grants_used); | ||
1687 | info->shadow[i].grants_used = NULL; | ||
1688 | kfree(info->shadow[i].sg); | ||
1689 | info->shadow[i].sg = NULL; | ||
1690 | kfree(info->shadow[i].indirect_grants); | ||
1691 | info->shadow[i].indirect_grants = NULL; | ||
1692 | } | ||
1693 | return -ENOMEM; | ||
1694 | } | ||
1695 | |||
1390 | /* | 1696 | /* |
1391 | * Invoked when the backend is finally 'ready' (and has told produced | 1697 | * Invoked when the backend is finally 'ready' (and has told produced |
1392 | * the details about the physical device - #sectors, size, etc). | 1698 | * the details about the physical device - #sectors, size, etc). |
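The negotiation in blkfront_setup_indirect() is simply the minimum of what the backend advertises and the frontend's module parameter, with a fallback to the classic limit when the backend key is missing; illustrative values:

/* backend feature-max-indirect-segments = 256, module param max = 32
 *     -> info->max_indirect_segments = min(256, 32) = 32
 * backend does not expose the key (old backend)
 *     -> info->max_indirect_segments = 0,
 *        segs = BLKIF_MAX_SEGMENTS_PER_REQUEST for all shadow allocations */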
@@ -1395,6 +1701,7 @@ static void blkfront_connect(struct blkfront_info *info) | |||
1395 | { | 1701 | { |
1396 | unsigned long long sectors; | 1702 | unsigned long long sectors; |
1397 | unsigned long sector_size; | 1703 | unsigned long sector_size; |
1704 | unsigned int physical_sector_size; | ||
1398 | unsigned int binfo; | 1705 | unsigned int binfo; |
1399 | int err; | 1706 | int err; |
1400 | int barrier, flush, discard, persistent; | 1707 | int barrier, flush, discard, persistent; |
@@ -1414,8 +1721,15 @@ static void blkfront_connect(struct blkfront_info *info) | |||
1414 | set_capacity(info->gd, sectors); | 1721 | set_capacity(info->gd, sectors); |
1415 | revalidate_disk(info->gd); | 1722 | revalidate_disk(info->gd); |
1416 | 1723 | ||
1417 | /* fall through */ | 1724 | return; |
1418 | case BLKIF_STATE_SUSPENDED: | 1725 | case BLKIF_STATE_SUSPENDED: |
1726 | /* | ||
1727 | * If we are recovering from suspension, we need to wait | ||
1728 | * for the backend to announce its features before | ||
1729 | * reconnecting, at least we need to know if the backend | ||
1730 | * supports indirect descriptors, and how many. | ||
1731 | */ | ||
1732 | blkif_recover(info); | ||
1419 | return; | 1733 | return; |
1420 | 1734 | ||
1421 | default: | 1735 | default: |
@@ -1437,6 +1751,16 @@ static void blkfront_connect(struct blkfront_info *info) | |||
1437 | return; | 1751 | return; |
1438 | } | 1752 | } |
1439 | 1753 | ||
1754 | /* | ||
1755 | * physical-sector-size is a newer field, so old backends may not | ||
1756 | * provide this. Assume physical sector size to be the same as | ||
1757 | * sector_size in that case. | ||
1758 | */ | ||
1759 | err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, | ||
1760 | "physical-sector-size", "%u", &physical_sector_size); | ||
1761 | if (err != 1) | ||
1762 | physical_sector_size = sector_size; | ||
1763 | |||
1440 | info->feature_flush = 0; | 1764 | info->feature_flush = 0; |
1441 | info->flush_op = 0; | 1765 | info->flush_op = 0; |
1442 | 1766 | ||
@@ -1483,7 +1807,15 @@ static void blkfront_connect(struct blkfront_info *info) | |||
1483 | else | 1807 | else |
1484 | info->feature_persistent = persistent; | 1808 | info->feature_persistent = persistent; |
1485 | 1809 | ||
1486 | err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); | 1810 | err = blkfront_setup_indirect(info); |
1811 | if (err) { | ||
1812 | xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", | ||
1813 | info->xbdev->otherend); | ||
1814 | return; | ||
1815 | } | ||
1816 | |||
1817 | err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size, | ||
1818 | physical_sector_size); | ||
1487 | if (err) { | 1819 | if (err) { |
1488 | xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", | 1820 | xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", |
1489 | info->xbdev->otherend); | 1821 | info->xbdev->otherend); |
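As a usage note: the ceiling applied in that min() comes from the module parameter declared earlier in this file, so it can be raised at load time (for example xen_blkfront.max=64 on the kernel command line, an illustrative value); since the parameter is registered with S_IRUGO it cannot be changed through sysfs once the module is loaded.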
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 048f2947e08b..e45f5575fd4d 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c | |||
@@ -63,7 +63,10 @@ | |||
63 | #include "bcache.h" | 63 | #include "bcache.h" |
64 | #include "btree.h" | 64 | #include "btree.h" |
65 | 65 | ||
66 | #include <linux/freezer.h> | ||
67 | #include <linux/kthread.h> | ||
66 | #include <linux/random.h> | 68 | #include <linux/random.h> |
69 | #include <trace/events/bcache.h> | ||
67 | 70 | ||
68 | #define MAX_IN_FLIGHT_DISCARDS 8U | 71 | #define MAX_IN_FLIGHT_DISCARDS 8U |
69 | 72 | ||
@@ -151,7 +154,7 @@ static void discard_finish(struct work_struct *w) | |||
151 | mutex_unlock(&ca->set->bucket_lock); | 154 | mutex_unlock(&ca->set->bucket_lock); |
152 | 155 | ||
153 | closure_wake_up(&ca->set->bucket_wait); | 156 | closure_wake_up(&ca->set->bucket_wait); |
154 | wake_up(&ca->set->alloc_wait); | 157 | wake_up_process(ca->alloc_thread); |
155 | 158 | ||
156 | closure_put(&ca->set->cl); | 159 | closure_put(&ca->set->cl); |
157 | } | 160 | } |
@@ -350,38 +353,30 @@ static void invalidate_buckets(struct cache *ca) | |||
350 | break; | 353 | break; |
351 | } | 354 | } |
352 | 355 | ||
353 | pr_debug("free %zu/%zu free_inc %zu/%zu unused %zu/%zu", | 356 | trace_bcache_alloc_invalidate(ca); |
354 | fifo_used(&ca->free), ca->free.size, | ||
355 | fifo_used(&ca->free_inc), ca->free_inc.size, | ||
356 | fifo_used(&ca->unused), ca->unused.size); | ||
357 | } | 357 | } |
358 | 358 | ||
359 | #define allocator_wait(ca, cond) \ | 359 | #define allocator_wait(ca, cond) \ |
360 | do { \ | 360 | do { \ |
361 | DEFINE_WAIT(__wait); \ | ||
362 | \ | ||
363 | while (1) { \ | 361 | while (1) { \ |
364 | prepare_to_wait(&ca->set->alloc_wait, \ | 362 | set_current_state(TASK_INTERRUPTIBLE); \ |
365 | &__wait, TASK_INTERRUPTIBLE); \ | ||
366 | if (cond) \ | 363 | if (cond) \ |
367 | break; \ | 364 | break; \ |
368 | \ | 365 | \ |
369 | mutex_unlock(&(ca)->set->bucket_lock); \ | 366 | mutex_unlock(&(ca)->set->bucket_lock); \ |
370 | if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \ | 367 | if (kthread_should_stop()) \ |
371 | finish_wait(&ca->set->alloc_wait, &__wait); \ | 368 | return 0; \ |
372 | closure_return(cl); \ | ||
373 | } \ | ||
374 | \ | 369 | \ |
370 | try_to_freeze(); \ | ||
375 | schedule(); \ | 371 | schedule(); \ |
376 | mutex_lock(&(ca)->set->bucket_lock); \ | 372 | mutex_lock(&(ca)->set->bucket_lock); \ |
377 | } \ | 373 | } \ |
378 | \ | 374 | __set_current_state(TASK_RUNNING); \ |
379 | finish_wait(&ca->set->alloc_wait, &__wait); \ | ||
380 | } while (0) | 375 | } while (0) |
381 | 376 | ||
382 | void bch_allocator_thread(struct closure *cl) | 377 | static int bch_allocator_thread(void *arg) |
383 | { | 378 | { |
384 | struct cache *ca = container_of(cl, struct cache, alloc); | 379 | struct cache *ca = arg; |
385 | 380 | ||
386 | mutex_lock(&ca->set->bucket_lock); | 381 | mutex_lock(&ca->set->bucket_lock); |
387 | 382 | ||
@@ -442,7 +437,7 @@ long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl) | |||
442 | { | 437 | { |
443 | long r = -1; | 438 | long r = -1; |
444 | again: | 439 | again: |
445 | wake_up(&ca->set->alloc_wait); | 440 | wake_up_process(ca->alloc_thread); |
446 | 441 | ||
447 | if (fifo_used(&ca->free) > ca->watermark[watermark] && | 442 | if (fifo_used(&ca->free) > ca->watermark[watermark] && |
448 | fifo_pop(&ca->free, r)) { | 443 | fifo_pop(&ca->free, r)) { |
@@ -476,9 +471,7 @@ again: | |||
476 | return r; | 471 | return r; |
477 | } | 472 | } |
478 | 473 | ||
479 | pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu", | 474 | trace_bcache_alloc_fail(ca); |
480 | atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free), | ||
481 | fifo_used(&ca->free_inc), fifo_used(&ca->unused)); | ||
482 | 475 | ||
483 | if (cl) { | 476 | if (cl) { |
484 | closure_wait(&ca->set->bucket_wait, cl); | 477 | closure_wait(&ca->set->bucket_wait, cl); |
@@ -552,6 +545,17 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, | |||
552 | 545 | ||
553 | /* Init */ | 546 | /* Init */ |
554 | 547 | ||
548 | int bch_cache_allocator_start(struct cache *ca) | ||
549 | { | ||
550 | struct task_struct *k = kthread_run(bch_allocator_thread, | ||
551 | ca, "bcache_allocator"); | ||
552 | if (IS_ERR(k)) | ||
553 | return PTR_ERR(k); | ||
554 | |||
555 | ca->alloc_thread = k; | ||
556 | return 0; | ||
557 | } | ||
558 | |||
555 | void bch_cache_allocator_exit(struct cache *ca) | 559 | void bch_cache_allocator_exit(struct cache *ca) |
556 | { | 560 | { |
557 | struct discard *d; | 561 | struct discard *d; |
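Taken together, the allocator moves from a closure woken through a waitqueue to a plain kernel thread; a sketch of the resulting lifecycle (the matching kthread_stop() call lives outside this hunk, so its exact location is an assumption):

/* creation, as in bch_cache_allocator_start() above */
ca->alloc_thread = kthread_run(bch_allocator_thread, ca, "bcache_allocator");

/* producers kick the thread directly instead of waking a waitqueue */
wake_up_process(ca->alloc_thread);

/* shutdown: kthread_stop() makes kthread_should_stop() return true, so
 * allocator_wait() drops the bucket lock and the thread returns 0 */
kthread_stop(ca->alloc_thread);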
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d3e15b42a4ab..b39f6f0b45f2 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h | |||
@@ -178,7 +178,6 @@ | |||
178 | #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ | 178 | #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ |
179 | 179 | ||
180 | #include <linux/bio.h> | 180 | #include <linux/bio.h> |
181 | #include <linux/blktrace_api.h> | ||
182 | #include <linux/kobject.h> | 181 | #include <linux/kobject.h> |
183 | #include <linux/list.h> | 182 | #include <linux/list.h> |
184 | #include <linux/mutex.h> | 183 | #include <linux/mutex.h> |
@@ -388,8 +387,6 @@ struct keybuf_key { | |||
388 | typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); | 387 | typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); |
389 | 388 | ||
390 | struct keybuf { | 389 | struct keybuf { |
391 | keybuf_pred_fn *key_predicate; | ||
392 | |||
393 | struct bkey last_scanned; | 390 | struct bkey last_scanned; |
394 | spinlock_t lock; | 391 | spinlock_t lock; |
395 | 392 | ||
@@ -437,9 +434,12 @@ struct bcache_device { | |||
437 | 434 | ||
438 | /* If nonzero, we're detaching/unregistering from cache set */ | 435 | /* If nonzero, we're detaching/unregistering from cache set */ |
439 | atomic_t detaching; | 436 | atomic_t detaching; |
437 | int flush_done; | ||
438 | |||
439 | uint64_t nr_stripes; | ||
440 | unsigned stripe_size_bits; | ||
441 | atomic_t *stripe_sectors_dirty; | ||
440 | 442 | ||
441 | atomic_long_t sectors_dirty; | ||
442 | unsigned long sectors_dirty_gc; | ||
443 | unsigned long sectors_dirty_last; | 443 | unsigned long sectors_dirty_last; |
444 | long sectors_dirty_derivative; | 444 | long sectors_dirty_derivative; |
445 | 445 | ||
@@ -531,6 +531,7 @@ struct cached_dev { | |||
531 | unsigned sequential_merge:1; | 531 | unsigned sequential_merge:1; |
532 | unsigned verify:1; | 532 | unsigned verify:1; |
533 | 533 | ||
534 | unsigned partial_stripes_expensive:1; | ||
534 | unsigned writeback_metadata:1; | 535 | unsigned writeback_metadata:1; |
535 | unsigned writeback_running:1; | 536 | unsigned writeback_running:1; |
536 | unsigned char writeback_percent; | 537 | unsigned char writeback_percent; |
@@ -565,8 +566,7 @@ struct cache { | |||
565 | 566 | ||
566 | unsigned watermark[WATERMARK_MAX]; | 567 | unsigned watermark[WATERMARK_MAX]; |
567 | 568 | ||
568 | struct closure alloc; | 569 | struct task_struct *alloc_thread; |
569 | struct workqueue_struct *alloc_workqueue; | ||
570 | 570 | ||
571 | struct closure prio; | 571 | struct closure prio; |
572 | struct prio_set *disk_buckets; | 572 | struct prio_set *disk_buckets; |
@@ -664,13 +664,9 @@ struct gc_stat { | |||
664 | * CACHE_SET_STOPPING always gets set first when we're closing down a cache set; | 664 | * CACHE_SET_STOPPING always gets set first when we're closing down a cache set; |
665 | * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e. | 665 | * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e. |
666 | * flushing dirty data). | 666 | * flushing dirty data). |
667 | * | ||
668 | * CACHE_SET_STOPPING_2 gets set at the last phase, when it's time to shut down | ||
669 | * the allocation thread. | ||
670 | */ | 667 | */ |
671 | #define CACHE_SET_UNREGISTERING 0 | 668 | #define CACHE_SET_UNREGISTERING 0 |
672 | #define CACHE_SET_STOPPING 1 | 669 | #define CACHE_SET_STOPPING 1 |
673 | #define CACHE_SET_STOPPING_2 2 | ||
674 | 670 | ||
675 | struct cache_set { | 671 | struct cache_set { |
676 | struct closure cl; | 672 | struct closure cl; |
@@ -703,9 +699,6 @@ struct cache_set { | |||
703 | /* For the btree cache */ | 699 | /* For the btree cache */ |
704 | struct shrinker shrink; | 700 | struct shrinker shrink; |
705 | 701 | ||
706 | /* For the allocator itself */ | ||
707 | wait_queue_head_t alloc_wait; | ||
708 | |||
709 | /* For the btree cache and anything allocation related */ | 702 | /* For the btree cache and anything allocation related */ |
710 | struct mutex bucket_lock; | 703 | struct mutex bucket_lock; |
711 | 704 | ||
@@ -823,10 +816,9 @@ struct cache_set { | |||
823 | 816 | ||
824 | /* | 817 | /* |
825 | * A btree node on disk could have too many bsets for an iterator to fit | 818 | * A btree node on disk could have too many bsets for an iterator to fit |
826 | * on the stack - this is a single element mempool for btree_read_work() | 819 | * on the stack - have to dynamically allocate them |
827 | */ | 820 | */ |
828 | struct mutex fill_lock; | 821 | mempool_t *fill_iter; |
829 | struct btree_iter *fill_iter; | ||
830 | 822 | ||
831 | /* | 823 | /* |
832 | * btree_sort() is a merge sort and requires temporary space - single | 824 | * btree_sort() is a merge sort and requires temporary space - single |
@@ -834,6 +826,7 @@ struct cache_set { | |||
834 | */ | 826 | */ |
835 | struct mutex sort_lock; | 827 | struct mutex sort_lock; |
836 | struct bset *sort; | 828 | struct bset *sort; |
829 | unsigned sort_crit_factor; | ||
837 | 830 | ||
838 | /* List of buckets we're currently writing data to */ | 831 | /* List of buckets we're currently writing data to */ |
839 | struct list_head data_buckets; | 832 | struct list_head data_buckets; |
@@ -906,8 +899,6 @@ static inline unsigned local_clock_us(void) | |||
906 | return local_clock() >> 10; | 899 | return local_clock() >> 10; |
907 | } | 900 | } |
908 | 901 | ||
909 | #define MAX_BSETS 4U | ||
910 | |||
911 | #define BTREE_PRIO USHRT_MAX | 902 | #define BTREE_PRIO USHRT_MAX |
912 | #define INITIAL_PRIO 32768 | 903 | #define INITIAL_PRIO 32768 |
913 | 904 | ||
@@ -1112,23 +1103,6 @@ static inline void __bkey_put(struct cache_set *c, struct bkey *k) | |||
1112 | atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); | 1103 | atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); |
1113 | } | 1104 | } |
1114 | 1105 | ||
1115 | /* Blktrace macros */ | ||
1116 | |||
1117 | #define blktrace_msg(c, fmt, ...) \ | ||
1118 | do { \ | ||
1119 | struct request_queue *q = bdev_get_queue(c->bdev); \ | ||
1120 | if (q) \ | ||
1121 | blk_add_trace_msg(q, fmt, ##__VA_ARGS__); \ | ||
1122 | } while (0) | ||
1123 | |||
1124 | #define blktrace_msg_all(s, fmt, ...) \ | ||
1125 | do { \ | ||
1126 | struct cache *_c; \ | ||
1127 | unsigned i; \ | ||
1128 | for_each_cache(_c, (s), i) \ | ||
1129 | blktrace_msg(_c, fmt, ##__VA_ARGS__); \ | ||
1130 | } while (0) | ||
1131 | |||
1132 | static inline void cached_dev_put(struct cached_dev *dc) | 1106 | static inline void cached_dev_put(struct cached_dev *dc) |
1133 | { | 1107 | { |
1134 | if (atomic_dec_and_test(&dc->count)) | 1108 | if (atomic_dec_and_test(&dc->count)) |
@@ -1173,10 +1147,16 @@ static inline uint8_t bucket_disk_gen(struct bucket *b) | |||
1173 | static struct kobj_attribute ksysfs_##n = \ | 1147 | static struct kobj_attribute ksysfs_##n = \ |
1174 | __ATTR(n, S_IWUSR|S_IRUSR, show, store) | 1148 | __ATTR(n, S_IWUSR|S_IRUSR, show, store) |
1175 | 1149 | ||
1176 | /* Forward declarations */ | 1150 | static inline void wake_up_allocators(struct cache_set *c) |
1151 | { | ||
1152 | struct cache *ca; | ||
1153 | unsigned i; | ||
1154 | |||
1155 | for_each_cache(ca, c, i) | ||
1156 | wake_up_process(ca->alloc_thread); | ||
1157 | } | ||
1177 | 1158 | ||
1178 | void bch_writeback_queue(struct cached_dev *); | 1159 | /* Forward declarations */ |
1179 | void bch_writeback_add(struct cached_dev *, unsigned); | ||
1180 | 1160 | ||
1181 | void bch_count_io_errors(struct cache *, int, const char *); | 1161 | void bch_count_io_errors(struct cache *, int, const char *); |
1182 | void bch_bbio_count_io_errors(struct cache_set *, struct bio *, | 1162 | void bch_bbio_count_io_errors(struct cache_set *, struct bio *, |
@@ -1193,7 +1173,6 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); | |||
1193 | uint8_t bch_inc_gen(struct cache *, struct bucket *); | 1173 | uint8_t bch_inc_gen(struct cache *, struct bucket *); |
1194 | void bch_rescale_priorities(struct cache_set *, int); | 1174 | void bch_rescale_priorities(struct cache_set *, int); |
1195 | bool bch_bucket_add_unused(struct cache *, struct bucket *); | 1175 | bool bch_bucket_add_unused(struct cache *, struct bucket *); |
1196 | void bch_allocator_thread(struct closure *); | ||
1197 | 1176 | ||
1198 | long bch_bucket_alloc(struct cache *, unsigned, struct closure *); | 1177 | long bch_bucket_alloc(struct cache *, unsigned, struct closure *); |
1199 | void bch_bucket_free(struct cache_set *, struct bkey *); | 1178 | void bch_bucket_free(struct cache_set *, struct bkey *); |
@@ -1241,9 +1220,9 @@ void bch_cache_set_stop(struct cache_set *); | |||
1241 | struct cache_set *bch_cache_set_alloc(struct cache_sb *); | 1220 | struct cache_set *bch_cache_set_alloc(struct cache_sb *); |
1242 | void bch_btree_cache_free(struct cache_set *); | 1221 | void bch_btree_cache_free(struct cache_set *); |
1243 | int bch_btree_cache_alloc(struct cache_set *); | 1222 | int bch_btree_cache_alloc(struct cache_set *); |
1244 | void bch_cached_dev_writeback_init(struct cached_dev *); | ||
1245 | void bch_moving_init_cache_set(struct cache_set *); | 1223 | void bch_moving_init_cache_set(struct cache_set *); |
1246 | 1224 | ||
1225 | int bch_cache_allocator_start(struct cache *ca); | ||
1247 | void bch_cache_allocator_exit(struct cache *ca); | 1226 | void bch_cache_allocator_exit(struct cache *ca); |
1248 | int bch_cache_allocator_init(struct cache *ca); | 1227 | int bch_cache_allocator_init(struct cache *ca); |
1249 | 1228 | ||
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 1d27d3af3251..8010eed06a51 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c | |||
@@ -78,6 +78,7 @@ struct bkey *bch_keylist_pop(struct keylist *l) | |||
78 | bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) | 78 | bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) |
79 | { | 79 | { |
80 | unsigned i; | 80 | unsigned i; |
81 | char buf[80]; | ||
81 | 82 | ||
82 | if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))) | 83 | if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))) |
83 | goto bad; | 84 | goto bad; |
@@ -102,7 +103,8 @@ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) | |||
102 | 103 | ||
103 | return false; | 104 | return false; |
104 | bad: | 105 | bad: |
105 | cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k)); | 106 | bch_bkey_to_text(buf, sizeof(buf), k); |
107 | cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k)); | ||
106 | return true; | 108 | return true; |
107 | } | 109 | } |
108 | 110 | ||
@@ -162,10 +164,16 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k) | |||
162 | #ifdef CONFIG_BCACHE_EDEBUG | 164 | #ifdef CONFIG_BCACHE_EDEBUG |
163 | bug: | 165 | bug: |
164 | mutex_unlock(&b->c->bucket_lock); | 166 | mutex_unlock(&b->c->bucket_lock); |
165 | btree_bug(b, | 167 | |
168 | { | ||
169 | char buf[80]; | ||
170 | |||
171 | bch_bkey_to_text(buf, sizeof(buf), k); | ||
172 | btree_bug(b, | ||
166 | "inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", | 173 | "inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", |
167 | pkey(k), PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), | 174 | buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), |
168 | g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); | 175 | g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); |
176 | } | ||
169 | return true; | 177 | return true; |
170 | #endif | 178 | #endif |
171 | } | 179 | } |
@@ -1084,33 +1092,39 @@ void bch_btree_sort_into(struct btree *b, struct btree *new) | |||
1084 | new->sets->size = 0; | 1092 | new->sets->size = 0; |
1085 | } | 1093 | } |
1086 | 1094 | ||
1095 | #define SORT_CRIT (4096 / sizeof(uint64_t)) | ||
1096 | |||
1087 | void bch_btree_sort_lazy(struct btree *b) | 1097 | void bch_btree_sort_lazy(struct btree *b) |
1088 | { | 1098 | { |
1089 | if (b->nsets) { | 1099 | unsigned crit = SORT_CRIT; |
1090 | unsigned i, j, keys = 0, total; | 1100 | int i; |
1091 | 1101 | ||
1092 | for (i = 0; i <= b->nsets; i++) | 1102 | /* Don't sort if nothing to do */ |
1093 | keys += b->sets[i].data->keys; | 1103 | if (!b->nsets) |
1094 | 1104 | goto out; | |
1095 | total = keys; | ||
1096 | 1105 | ||
1097 | for (j = 0; j < b->nsets; j++) { | 1106 | /* If not a leaf node, always sort */ |
1098 | if (keys * 2 < total || | 1107 | if (b->level) { |
1099 | keys < 1000) { | 1108 | bch_btree_sort(b); |
1100 | bch_btree_sort_partial(b, j); | 1109 | return; |
1101 | return; | 1110 | } |
1102 | } | ||
1103 | 1111 | ||
1104 | keys -= b->sets[j].data->keys; | 1112 | for (i = b->nsets - 1; i >= 0; --i) { |
1105 | } | 1113 | crit *= b->c->sort_crit_factor; |
1106 | 1114 | ||
1107 | /* Must sort if b->nsets == 3 or we'll overflow */ | 1115 | if (b->sets[i].data->keys < crit) { |
1108 | if (b->nsets >= (MAX_BSETS - 1) - b->level) { | 1116 | bch_btree_sort_partial(b, i); |
1109 | bch_btree_sort(b); | ||
1110 | return; | 1117 | return; |
1111 | } | 1118 | } |
1112 | } | 1119 | } |
1113 | 1120 | ||
1121 | /* Sort if we'd overflow */ | ||
1122 | if (b->nsets + 1 == MAX_BSETS) { | ||
1123 | bch_btree_sort(b); | ||
1124 | return; | ||
1125 | } | ||
1126 | |||
1127 | out: | ||
1114 | bset_build_written_tree(b); | 1128 | bset_build_written_tree(b); |
1115 | } | 1129 | } |
1116 | 1130 | ||
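Worked numbers for the rewritten lazy-sort criterion: SORT_CRIT is 4096 / sizeof(uint64_t) = 512 keys, and the threshold is multiplied by c->sort_crit_factor (set elsewhere; its value is not shown in this diff) once per bset as the loop walks from the newest set towards set 0, so progressively larger sets are required before an older bset triggers a partial sort:

/* nsets = 3, sort_crit_factor = F (illustrative):
 *   i = 2: partial sort from set 2 if b->sets[2].data->keys < 512 * F
 *   i = 1: partial sort from set 1 if b->sets[1].data->keys < 512 * F^2
 *   i = 0: partial sort from set 0 if b->sets[0].data->keys < 512 * F^3
 * Non-leaf nodes are always fully sorted, and a full sort is forced when
 * adding one more bset would reach MAX_BSETS. */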
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 57a9cff41546..ae115a253d73 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h | |||
@@ -1,6 +1,8 @@ | |||
1 | #ifndef _BCACHE_BSET_H | 1 | #ifndef _BCACHE_BSET_H |
2 | #define _BCACHE_BSET_H | 2 | #define _BCACHE_BSET_H |
3 | 3 | ||
4 | #include <linux/slab.h> | ||
5 | |||
4 | /* | 6 | /* |
5 | * BKEYS: | 7 | * BKEYS: |
6 | * | 8 | * |
@@ -142,6 +144,8 @@ | |||
142 | 144 | ||
143 | /* Btree key comparison/iteration */ | 145 | /* Btree key comparison/iteration */ |
144 | 146 | ||
147 | #define MAX_BSETS 4U | ||
148 | |||
145 | struct btree_iter { | 149 | struct btree_iter { |
146 | size_t size, used; | 150 | size_t size, used; |
147 | struct btree_iter_set { | 151 | struct btree_iter_set { |
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 7a5658f04e62..ee372884c405 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include "btree.h" | 24 | #include "btree.h" |
25 | #include "debug.h" | 25 | #include "debug.h" |
26 | #include "request.h" | 26 | #include "request.h" |
27 | #include "writeback.h" | ||
27 | 28 | ||
28 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
29 | #include <linux/bitops.h> | 30 | #include <linux/bitops.h> |
@@ -134,44 +135,17 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i) | |||
134 | return crc ^ 0xffffffffffffffffULL; | 135 | return crc ^ 0xffffffffffffffffULL; |
135 | } | 136 | } |
136 | 137 | ||
137 | static void btree_bio_endio(struct bio *bio, int error) | 138 | static void bch_btree_node_read_done(struct btree *b) |
138 | { | 139 | { |
139 | struct closure *cl = bio->bi_private; | ||
140 | struct btree *b = container_of(cl, struct btree, io.cl); | ||
141 | |||
142 | if (error) | ||
143 | set_btree_node_io_error(b); | ||
144 | |||
145 | bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE) | ||
146 | ? "writing btree" : "reading btree"); | ||
147 | closure_put(cl); | ||
148 | } | ||
149 | |||
150 | static void btree_bio_init(struct btree *b) | ||
151 | { | ||
152 | BUG_ON(b->bio); | ||
153 | b->bio = bch_bbio_alloc(b->c); | ||
154 | |||
155 | b->bio->bi_end_io = btree_bio_endio; | ||
156 | b->bio->bi_private = &b->io.cl; | ||
157 | } | ||
158 | |||
159 | void bch_btree_read_done(struct closure *cl) | ||
160 | { | ||
161 | struct btree *b = container_of(cl, struct btree, io.cl); | ||
162 | struct bset *i = b->sets[0].data; | ||
163 | struct btree_iter *iter = b->c->fill_iter; | ||
164 | const char *err = "bad btree header"; | 140 | const char *err = "bad btree header"; |
165 | BUG_ON(b->nsets || b->written); | 141 | struct bset *i = b->sets[0].data; |
166 | 142 | struct btree_iter *iter; | |
167 | bch_bbio_free(b->bio, b->c); | ||
168 | b->bio = NULL; | ||
169 | 143 | ||
170 | mutex_lock(&b->c->fill_lock); | 144 | iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); |
145 | iter->size = b->c->sb.bucket_size / b->c->sb.block_size; | ||
171 | iter->used = 0; | 146 | iter->used = 0; |
172 | 147 | ||
173 | if (btree_node_io_error(b) || | 148 | if (!i->seq) |
174 | !i->seq) | ||
175 | goto err; | 149 | goto err; |
176 | 150 | ||
177 | for (; | 151 | for (; |
@@ -228,17 +202,8 @@ void bch_btree_read_done(struct closure *cl) | |||
228 | if (b->written < btree_blocks(b)) | 202 | if (b->written < btree_blocks(b)) |
229 | bch_bset_init_next(b); | 203 | bch_bset_init_next(b); |
230 | out: | 204 | out: |
231 | 205 | mempool_free(iter, b->c->fill_iter); | |
232 | mutex_unlock(&b->c->fill_lock); | 206 | return; |
233 | |||
234 | spin_lock(&b->c->btree_read_time_lock); | ||
235 | bch_time_stats_update(&b->c->btree_read_time, b->io_start_time); | ||
236 | spin_unlock(&b->c->btree_read_time_lock); | ||
237 | |||
238 | smp_wmb(); /* read_done is our write lock */ | ||
239 | set_btree_node_read_done(b); | ||
240 | |||
241 | closure_return(cl); | ||
242 | err: | 207 | err: |
243 | set_btree_node_io_error(b); | 208 | set_btree_node_io_error(b); |
244 | bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", | 209 | bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", |
@@ -247,48 +212,69 @@ err: | |||
247 | goto out; | 212 | goto out; |
248 | } | 213 | } |
249 | 214 | ||
250 | void bch_btree_read(struct btree *b) | 215 | static void btree_node_read_endio(struct bio *bio, int error) |
216 | { | ||
217 | struct closure *cl = bio->bi_private; | ||
218 | closure_put(cl); | ||
219 | } | ||
220 | |||
221 | void bch_btree_node_read(struct btree *b) | ||
251 | { | 222 | { |
252 | BUG_ON(b->nsets || b->written); | 223 | uint64_t start_time = local_clock(); |
224 | struct closure cl; | ||
225 | struct bio *bio; | ||
226 | |||
227 | trace_bcache_btree_read(b); | ||
228 | |||
229 | closure_init_stack(&cl); | ||
230 | |||
231 | bio = bch_bbio_alloc(b->c); | ||
232 | bio->bi_rw = REQ_META|READ_SYNC; | ||
233 | bio->bi_size = KEY_SIZE(&b->key) << 9; | ||
234 | bio->bi_end_io = btree_node_read_endio; | ||
235 | bio->bi_private = &cl; | ||
236 | |||
237 | bch_bio_map(bio, b->sets[0].data); | ||
238 | |||
239 | bch_submit_bbio(bio, b->c, &b->key, 0); | ||
240 | closure_sync(&cl); | ||
253 | 241 | ||
254 | if (!closure_trylock(&b->io.cl, &b->c->cl)) | 242 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) |
255 | BUG(); | 243 | set_btree_node_io_error(b); |
256 | 244 | ||
257 | b->io_start_time = local_clock(); | 245 | bch_bbio_free(bio, b->c); |
258 | 246 | ||
259 | btree_bio_init(b); | 247 | if (btree_node_io_error(b)) |
260 | b->bio->bi_rw = REQ_META|READ_SYNC; | 248 | goto err; |
261 | b->bio->bi_size = KEY_SIZE(&b->key) << 9; | ||
262 | 249 | ||
263 | bch_bio_map(b->bio, b->sets[0].data); | 250 | bch_btree_node_read_done(b); |
264 | 251 | ||
265 | pr_debug("%s", pbtree(b)); | 252 | spin_lock(&b->c->btree_read_time_lock); |
266 | trace_bcache_btree_read(b->bio); | 253 | bch_time_stats_update(&b->c->btree_read_time, start_time); |
267 | bch_submit_bbio(b->bio, b->c, &b->key, 0); | 254 | spin_unlock(&b->c->btree_read_time_lock); |
268 | 255 | ||
269 | continue_at(&b->io.cl, bch_btree_read_done, system_wq); | 256 | return; |
257 | err: | ||
258 | bch_cache_set_error(b->c, "io error reading bucket %lu", | ||
259 | PTR_BUCKET_NR(b->c, &b->key, 0)); | ||
270 | } | 260 | } |
271 | 261 | ||
272 | static void btree_complete_write(struct btree *b, struct btree_write *w) | 262 | static void btree_complete_write(struct btree *b, struct btree_write *w) |
273 | { | 263 | { |
274 | if (w->prio_blocked && | 264 | if (w->prio_blocked && |
275 | !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) | 265 | !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) |
276 | wake_up(&b->c->alloc_wait); | 266 | wake_up_allocators(b->c); |
277 | 267 | ||
278 | if (w->journal) { | 268 | if (w->journal) { |
279 | atomic_dec_bug(w->journal); | 269 | atomic_dec_bug(w->journal); |
280 | __closure_wake_up(&b->c->journal.wait); | 270 | __closure_wake_up(&b->c->journal.wait); |
281 | } | 271 | } |
282 | 272 | ||
283 | if (w->owner) | ||
284 | closure_put(w->owner); | ||
285 | |||
286 | w->prio_blocked = 0; | 273 | w->prio_blocked = 0; |
287 | w->journal = NULL; | 274 | w->journal = NULL; |
288 | w->owner = NULL; | ||
289 | } | 275 | } |
290 | 276 | ||
291 | static void __btree_write_done(struct closure *cl) | 277 | static void __btree_node_write_done(struct closure *cl) |
292 | { | 278 | { |
293 | struct btree *b = container_of(cl, struct btree, io.cl); | 279 | struct btree *b = container_of(cl, struct btree, io.cl); |
294 | struct btree_write *w = btree_prev_write(b); | 280 | struct btree_write *w = btree_prev_write(b); |
@@ -304,7 +290,7 @@ static void __btree_write_done(struct closure *cl) | |||
304 | closure_return(cl); | 290 | closure_return(cl); |
305 | } | 291 | } |
306 | 292 | ||
307 | static void btree_write_done(struct closure *cl) | 293 | static void btree_node_write_done(struct closure *cl) |
308 | { | 294 | { |
309 | struct btree *b = container_of(cl, struct btree, io.cl); | 295 | struct btree *b = container_of(cl, struct btree, io.cl); |
310 | struct bio_vec *bv; | 296 | struct bio_vec *bv; |
@@ -313,10 +299,22 @@ static void btree_write_done(struct closure *cl) | |||
313 | __bio_for_each_segment(bv, b->bio, n, 0) | 299 | __bio_for_each_segment(bv, b->bio, n, 0) |
314 | __free_page(bv->bv_page); | 300 | __free_page(bv->bv_page); |
315 | 301 | ||
316 | __btree_write_done(cl); | 302 | __btree_node_write_done(cl); |
317 | } | 303 | } |
318 | 304 | ||
319 | static void do_btree_write(struct btree *b) | 305 | static void btree_node_write_endio(struct bio *bio, int error) |
306 | { | ||
307 | struct closure *cl = bio->bi_private; | ||
308 | struct btree *b = container_of(cl, struct btree, io.cl); | ||
309 | |||
310 | if (error) | ||
311 | set_btree_node_io_error(b); | ||
312 | |||
313 | bch_bbio_count_io_errors(b->c, bio, error, "writing btree"); | ||
314 | closure_put(cl); | ||
315 | } | ||
316 | |||
317 | static void do_btree_node_write(struct btree *b) | ||
320 | { | 318 | { |
321 | struct closure *cl = &b->io.cl; | 319 | struct closure *cl = &b->io.cl; |
322 | struct bset *i = b->sets[b->nsets].data; | 320 | struct bset *i = b->sets[b->nsets].data; |
@@ -325,15 +323,34 @@ static void do_btree_write(struct btree *b) | |||
325 | i->version = BCACHE_BSET_VERSION; | 323 | i->version = BCACHE_BSET_VERSION; |
326 | i->csum = btree_csum_set(b, i); | 324 | i->csum = btree_csum_set(b, i); |
327 | 325 | ||
328 | btree_bio_init(b); | 326 | BUG_ON(b->bio); |
329 | b->bio->bi_rw = REQ_META|WRITE_SYNC; | 327 | b->bio = bch_bbio_alloc(b->c); |
330 | b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); | 328 | |
329 | b->bio->bi_end_io = btree_node_write_endio; | ||
330 | b->bio->bi_private = &b->io.cl; | ||
331 | b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; | ||
332 | b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); | ||
331 | bch_bio_map(b->bio, i); | 333 | bch_bio_map(b->bio, i); |
332 | 334 | ||
335 | /* | ||
336 | * If we're appending to a leaf node, we don't technically need FUA - | ||
337 | * this write just needs to be persisted before the next journal write, | ||
338 | * which will be marked FLUSH|FUA. | ||
339 | * | ||
340 | * Similarly if we're writing a new btree root - the pointer is going to | ||
341 | * be in the next journal entry. | ||
342 | * | ||
343 | * But if we're writing a new btree node (that isn't a root) or | ||
344 | * appending to a non leaf btree node, we need either FUA or a flush | ||
345 | * when we write the parent with the new pointer. FUA is cheaper than a | ||
346 | * flush, and writes appending to leaf nodes aren't blocking anything so | ||
347 | * just make all btree node writes FUA to keep things sane. | ||
348 | */ | ||
349 | |||
333 | bkey_copy(&k.key, &b->key); | 350 | bkey_copy(&k.key, &b->key); |
334 | SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); | 351 | SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); |
335 | 352 | ||
336 | if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) { | 353 | if (!bio_alloc_pages(b->bio, GFP_NOIO)) { |
337 | int j; | 354 | int j; |
338 | struct bio_vec *bv; | 355 | struct bio_vec *bv; |
339 | void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); | 356 | void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); |
@@ -342,40 +359,41 @@ static void do_btree_write(struct btree *b) | |||
342 | memcpy(page_address(bv->bv_page), | 359 | memcpy(page_address(bv->bv_page), |
343 | base + j * PAGE_SIZE, PAGE_SIZE); | 360 | base + j * PAGE_SIZE, PAGE_SIZE); |
344 | 361 | ||
345 | trace_bcache_btree_write(b->bio); | ||
346 | bch_submit_bbio(b->bio, b->c, &k.key, 0); | 362 | bch_submit_bbio(b->bio, b->c, &k.key, 0); |
347 | 363 | ||
348 | continue_at(cl, btree_write_done, NULL); | 364 | continue_at(cl, btree_node_write_done, NULL); |
349 | } else { | 365 | } else { |
350 | b->bio->bi_vcnt = 0; | 366 | b->bio->bi_vcnt = 0; |
351 | bch_bio_map(b->bio, i); | 367 | bch_bio_map(b->bio, i); |
352 | 368 | ||
353 | trace_bcache_btree_write(b->bio); | ||
354 | bch_submit_bbio(b->bio, b->c, &k.key, 0); | 369 | bch_submit_bbio(b->bio, b->c, &k.key, 0); |
355 | 370 | ||
356 | closure_sync(cl); | 371 | closure_sync(cl); |
357 | __btree_write_done(cl); | 372 | __btree_node_write_done(cl); |
358 | } | 373 | } |
359 | } | 374 | } |
360 | 375 | ||
361 | static void __btree_write(struct btree *b) | 376 | void bch_btree_node_write(struct btree *b, struct closure *parent) |
362 | { | 377 | { |
363 | struct bset *i = b->sets[b->nsets].data; | 378 | struct bset *i = b->sets[b->nsets].data; |
364 | 379 | ||
380 | trace_bcache_btree_write(b); | ||
381 | |||
365 | BUG_ON(current->bio_list); | 382 | BUG_ON(current->bio_list); |
383 | BUG_ON(b->written >= btree_blocks(b)); | ||
384 | BUG_ON(b->written && !i->keys); | ||
385 | BUG_ON(b->sets->data->seq != i->seq); | ||
386 | bch_check_key_order(b, i); | ||
366 | 387 | ||
367 | closure_lock(&b->io, &b->c->cl); | ||
368 | cancel_delayed_work(&b->work); | 388 | cancel_delayed_work(&b->work); |
369 | 389 | ||
390 | /* If caller isn't waiting for write, parent refcount is cache set */ | ||
391 | closure_lock(&b->io, parent ?: &b->c->cl); | ||
392 | |||
370 | clear_bit(BTREE_NODE_dirty, &b->flags); | 393 | clear_bit(BTREE_NODE_dirty, &b->flags); |
371 | change_bit(BTREE_NODE_write_idx, &b->flags); | 394 | change_bit(BTREE_NODE_write_idx, &b->flags); |
372 | 395 | ||
373 | bch_check_key_order(b, i); | 396 | do_btree_node_write(b); |
374 | BUG_ON(b->written && !i->keys); | ||
375 | |||
376 | do_btree_write(b); | ||
377 | |||
378 | pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys); | ||
379 | 397 | ||
380 | b->written += set_blocks(i, b->c); | 398 | b->written += set_blocks(i, b->c); |
381 | atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, | 399 | atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, |
@@ -387,37 +405,31 @@ static void __btree_write(struct btree *b) | |||
387 | bch_bset_init_next(b); | 405 | bch_bset_init_next(b); |
388 | } | 406 | } |
389 | 407 | ||
390 | static void btree_write_work(struct work_struct *w) | 408 | static void btree_node_write_work(struct work_struct *w) |
391 | { | 409 | { |
392 | struct btree *b = container_of(to_delayed_work(w), struct btree, work); | 410 | struct btree *b = container_of(to_delayed_work(w), struct btree, work); |
393 | 411 | ||
394 | down_write(&b->lock); | 412 | rw_lock(true, b, b->level); |
395 | 413 | ||
396 | if (btree_node_dirty(b)) | 414 | if (btree_node_dirty(b)) |
397 | __btree_write(b); | 415 | bch_btree_node_write(b, NULL); |
398 | up_write(&b->lock); | 416 | rw_unlock(true, b); |
399 | } | 417 | } |
400 | 418 | ||
401 | void bch_btree_write(struct btree *b, bool now, struct btree_op *op) | 419 | static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) |
402 | { | 420 | { |
403 | struct bset *i = b->sets[b->nsets].data; | 421 | struct bset *i = b->sets[b->nsets].data; |
404 | struct btree_write *w = btree_current_write(b); | 422 | struct btree_write *w = btree_current_write(b); |
405 | 423 | ||
406 | BUG_ON(b->written && | 424 | BUG_ON(!b->written); |
407 | (b->written >= btree_blocks(b) || | 425 | BUG_ON(!i->keys); |
408 | i->seq != b->sets[0].data->seq || | ||
409 | !i->keys)); | ||
410 | 426 | ||
411 | if (!btree_node_dirty(b)) { | 427 | if (!btree_node_dirty(b)) |
412 | set_btree_node_dirty(b); | 428 | queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); |
413 | queue_delayed_work(btree_io_wq, &b->work, | ||
414 | msecs_to_jiffies(30000)); | ||
415 | } | ||
416 | 429 | ||
417 | w->prio_blocked += b->prio_blocked; | 430 | set_btree_node_dirty(b); |
418 | b->prio_blocked = 0; | ||
419 | 431 | ||
420 | if (op && op->journal && !b->level) { | 432 | if (op && op->journal) { |
421 | if (w->journal && | 433 | if (w->journal && |
422 | journal_pin_cmp(b->c, w, op)) { | 434 | journal_pin_cmp(b->c, w, op)) { |
423 | atomic_dec_bug(w->journal); | 435 | atomic_dec_bug(w->journal); |
@@ -430,23 +442,10 @@ void bch_btree_write(struct btree *b, bool now, struct btree_op *op) | |||
430 | } | 442 | } |
431 | } | 443 | } |
432 | 444 | ||
433 | if (current->bio_list) | ||
434 | return; | ||
435 | |||
436 | /* Force write if set is too big */ | 445 | /* Force write if set is too big */ |
437 | if (now || | 446 | if (set_bytes(i) > PAGE_SIZE - 48 && |
438 | b->level || | 447 | !current->bio_list) |
439 | set_bytes(i) > PAGE_SIZE - 48) { | 448 | bch_btree_node_write(b, NULL); |
440 | if (op && now) { | ||
441 | /* Must wait on multiple writes */ | ||
442 | BUG_ON(w->owner); | ||
443 | w->owner = &op->cl; | ||
444 | closure_get(&op->cl); | ||
445 | } | ||
446 | |||
447 | __btree_write(b); | ||
448 | } | ||
449 | BUG_ON(!b->written); | ||
450 | } | 449 | } |
451 | 450 | ||
452 | /* | 451 | /* |
@@ -559,7 +558,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, | |||
559 | init_rwsem(&b->lock); | 558 | init_rwsem(&b->lock); |
560 | lockdep_set_novalidate_class(&b->lock); | 559 | lockdep_set_novalidate_class(&b->lock); |
561 | INIT_LIST_HEAD(&b->list); | 560 | INIT_LIST_HEAD(&b->list); |
562 | INIT_DELAYED_WORK(&b->work, btree_write_work); | 561 | INIT_DELAYED_WORK(&b->work, btree_node_write_work); |
563 | b->c = c; | 562 | b->c = c; |
564 | closure_init_unlocked(&b->io); | 563 | closure_init_unlocked(&b->io); |
565 | 564 | ||
@@ -582,7 +581,7 @@ static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) | |||
582 | BUG_ON(btree_node_dirty(b) && !b->sets[0].data); | 581 | BUG_ON(btree_node_dirty(b) && !b->sets[0].data); |
583 | 582 | ||
584 | if (cl && btree_node_dirty(b)) | 583 | if (cl && btree_node_dirty(b)) |
585 | bch_btree_write(b, true, NULL); | 584 | bch_btree_node_write(b, NULL); |
586 | 585 | ||
587 | if (cl) | 586 | if (cl) |
588 | closure_wait_event_async(&b->io.wait, cl, | 587 | closure_wait_event_async(&b->io.wait, cl, |
@@ -623,6 +622,13 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc) | |||
623 | else if (!mutex_trylock(&c->bucket_lock)) | 622 | else if (!mutex_trylock(&c->bucket_lock)) |
624 | return -1; | 623 | return -1; |
625 | 624 | ||
625 | /* | ||
626 | * It's _really_ critical that we don't free too many btree nodes - we | ||
627 | * have to always leave ourselves a reserve. The reserve is how we | ||
628 | * guarantee that allocating memory for a new btree node can always | ||
629 | * succeed, so that inserting keys into the btree can always succeed and | ||
630 | * IO can always make forward progress: | ||
631 | */ | ||
626 | nr /= c->btree_pages; | 632 | nr /= c->btree_pages; |
627 | nr = min_t(unsigned long, nr, mca_can_free(c)); | 633 | nr = min_t(unsigned long, nr, mca_can_free(c)); |
628 | 634 | ||
@@ -766,6 +772,8 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, | |||
766 | int ret = -ENOMEM; | 772 | int ret = -ENOMEM; |
767 | struct btree *i; | 773 | struct btree *i; |
768 | 774 | ||
775 | trace_bcache_btree_cache_cannibalize(c); | ||
776 | |||
769 | if (!cl) | 777 | if (!cl) |
770 | return ERR_PTR(-ENOMEM); | 778 | return ERR_PTR(-ENOMEM); |
771 | 779 | ||
@@ -784,7 +792,6 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, | |||
784 | return ERR_PTR(-EAGAIN); | 792 | return ERR_PTR(-EAGAIN); |
785 | } | 793 | } |
786 | 794 | ||
787 | /* XXX: tracepoint */ | ||
788 | c->try_harder = cl; | 795 | c->try_harder = cl; |
789 | c->try_harder_start = local_clock(); | 796 | c->try_harder_start = local_clock(); |
790 | retry: | 797 | retry: |
@@ -905,6 +912,9 @@ retry: | |||
905 | b = mca_find(c, k); | 912 | b = mca_find(c, k); |
906 | 913 | ||
907 | if (!b) { | 914 | if (!b) { |
915 | if (current->bio_list) | ||
916 | return ERR_PTR(-EAGAIN); | ||
917 | |||
908 | mutex_lock(&c->bucket_lock); | 918 | mutex_lock(&c->bucket_lock); |
909 | b = mca_alloc(c, k, level, &op->cl); | 919 | b = mca_alloc(c, k, level, &op->cl); |
910 | mutex_unlock(&c->bucket_lock); | 920 | mutex_unlock(&c->bucket_lock); |
@@ -914,7 +924,7 @@ retry: | |||
914 | if (IS_ERR(b)) | 924 | if (IS_ERR(b)) |
915 | return b; | 925 | return b; |
916 | 926 | ||
917 | bch_btree_read(b); | 927 | bch_btree_node_read(b); |
918 | 928 | ||
919 | if (!write) | 929 | if (!write) |
920 | downgrade_write(&b->lock); | 930 | downgrade_write(&b->lock); |
@@ -937,15 +947,12 @@ retry: | |||
937 | for (; i <= b->nsets; i++) | 947 | for (; i <= b->nsets; i++) |
938 | prefetch(b->sets[i].data); | 948 | prefetch(b->sets[i].data); |
939 | 949 | ||
940 | if (!closure_wait_event(&b->io.wait, &op->cl, | 950 | if (btree_node_io_error(b)) { |
941 | btree_node_read_done(b))) { | ||
942 | rw_unlock(write, b); | ||
943 | b = ERR_PTR(-EAGAIN); | ||
944 | } else if (btree_node_io_error(b)) { | ||
945 | rw_unlock(write, b); | 951 | rw_unlock(write, b); |
946 | b = ERR_PTR(-EIO); | 952 | return ERR_PTR(-EIO); |
947 | } else | 953 | } |
948 | BUG_ON(!b->written); | 954 | |
955 | BUG_ON(!b->written); | ||
949 | 956 | ||
950 | return b; | 957 | return b; |
951 | } | 958 | } |
@@ -959,7 +966,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) | |||
959 | mutex_unlock(&c->bucket_lock); | 966 | mutex_unlock(&c->bucket_lock); |
960 | 967 | ||
961 | if (!IS_ERR_OR_NULL(b)) { | 968 | if (!IS_ERR_OR_NULL(b)) { |
962 | bch_btree_read(b); | 969 | bch_btree_node_read(b); |
963 | rw_unlock(true, b); | 970 | rw_unlock(true, b); |
964 | } | 971 | } |
965 | } | 972 | } |
@@ -970,24 +977,19 @@ static void btree_node_free(struct btree *b, struct btree_op *op) | |||
970 | { | 977 | { |
971 | unsigned i; | 978 | unsigned i; |
972 | 979 | ||
980 | trace_bcache_btree_node_free(b); | ||
981 | |||
973 | /* | 982 | /* |
974 | * The BUG_ON() in btree_node_get() implies that we must have a write | 983 | * The BUG_ON() in btree_node_get() implies that we must have a write |
975 | * lock on parent to free or even invalidate a node | 984 | * lock on parent to free or even invalidate a node |
976 | */ | 985 | */ |
977 | BUG_ON(op->lock <= b->level); | 986 | BUG_ON(op->lock <= b->level); |
978 | BUG_ON(b == b->c->root); | 987 | BUG_ON(b == b->c->root); |
979 | pr_debug("bucket %s", pbtree(b)); | ||
980 | 988 | ||
981 | if (btree_node_dirty(b)) | 989 | if (btree_node_dirty(b)) |
982 | btree_complete_write(b, btree_current_write(b)); | 990 | btree_complete_write(b, btree_current_write(b)); |
983 | clear_bit(BTREE_NODE_dirty, &b->flags); | 991 | clear_bit(BTREE_NODE_dirty, &b->flags); |
984 | 992 | ||
985 | if (b->prio_blocked && | ||
986 | !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked)) | ||
987 | wake_up(&b->c->alloc_wait); | ||
988 | |||
989 | b->prio_blocked = 0; | ||
990 | |||
991 | cancel_delayed_work(&b->work); | 993 | cancel_delayed_work(&b->work); |
992 | 994 | ||
993 | mutex_lock(&b->c->bucket_lock); | 995 | mutex_lock(&b->c->bucket_lock); |
@@ -1028,17 +1030,20 @@ retry: | |||
1028 | goto retry; | 1030 | goto retry; |
1029 | } | 1031 | } |
1030 | 1032 | ||
1031 | set_btree_node_read_done(b); | ||
1032 | b->accessed = 1; | 1033 | b->accessed = 1; |
1033 | bch_bset_init_next(b); | 1034 | bch_bset_init_next(b); |
1034 | 1035 | ||
1035 | mutex_unlock(&c->bucket_lock); | 1036 | mutex_unlock(&c->bucket_lock); |
1037 | |||
1038 | trace_bcache_btree_node_alloc(b); | ||
1036 | return b; | 1039 | return b; |
1037 | err_free: | 1040 | err_free: |
1038 | bch_bucket_free(c, &k.key); | 1041 | bch_bucket_free(c, &k.key); |
1039 | __bkey_put(c, &k.key); | 1042 | __bkey_put(c, &k.key); |
1040 | err: | 1043 | err: |
1041 | mutex_unlock(&c->bucket_lock); | 1044 | mutex_unlock(&c->bucket_lock); |
1045 | |||
1046 | trace_bcache_btree_node_alloc_fail(b); | ||
1042 | return b; | 1047 | return b; |
1043 | } | 1048 | } |
1044 | 1049 | ||
@@ -1137,11 +1142,8 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys, | |||
1137 | gc->nkeys++; | 1142 | gc->nkeys++; |
1138 | 1143 | ||
1139 | gc->data += KEY_SIZE(k); | 1144 | gc->data += KEY_SIZE(k); |
1140 | if (KEY_DIRTY(k)) { | 1145 | if (KEY_DIRTY(k)) |
1141 | gc->dirty += KEY_SIZE(k); | 1146 | gc->dirty += KEY_SIZE(k); |
1142 | if (d) | ||
1143 | d->sectors_dirty_gc += KEY_SIZE(k); | ||
1144 | } | ||
1145 | } | 1147 | } |
1146 | 1148 | ||
1147 | for (t = b->sets; t <= &b->sets[b->nsets]; t++) | 1149 | for (t = b->sets; t <= &b->sets[b->nsets]; t++) |
@@ -1166,14 +1168,11 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, | |||
1166 | 1168 | ||
1167 | if (!IS_ERR_OR_NULL(n)) { | 1169 | if (!IS_ERR_OR_NULL(n)) { |
1168 | swap(b, n); | 1170 | swap(b, n); |
1171 | __bkey_put(b->c, &b->key); | ||
1169 | 1172 | ||
1170 | memcpy(k->ptr, b->key.ptr, | 1173 | memcpy(k->ptr, b->key.ptr, |
1171 | sizeof(uint64_t) * KEY_PTRS(&b->key)); | 1174 | sizeof(uint64_t) * KEY_PTRS(&b->key)); |
1172 | 1175 | ||
1173 | __bkey_put(b->c, &b->key); | ||
1174 | atomic_inc(&b->c->prio_blocked); | ||
1175 | b->prio_blocked++; | ||
1176 | |||
1177 | btree_node_free(n, op); | 1176 | btree_node_free(n, op); |
1178 | up_write(&n->lock); | 1177 | up_write(&n->lock); |
1179 | } | 1178 | } |
@@ -1278,7 +1277,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op, | |||
1278 | btree_node_free(r->b, op); | 1277 | btree_node_free(r->b, op); |
1279 | up_write(&r->b->lock); | 1278 | up_write(&r->b->lock); |
1280 | 1279 | ||
1281 | pr_debug("coalesced %u nodes", nodes); | 1280 | trace_bcache_btree_gc_coalesce(nodes); |
1282 | 1281 | ||
1283 | gc->nodes--; | 1282 | gc->nodes--; |
1284 | nodes--; | 1283 | nodes--; |
@@ -1293,14 +1292,9 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, | |||
1293 | void write(struct btree *r) | 1292 | void write(struct btree *r) |
1294 | { | 1293 | { |
1295 | if (!r->written) | 1294 | if (!r->written) |
1296 | bch_btree_write(r, true, op); | 1295 | bch_btree_node_write(r, &op->cl); |
1297 | else if (btree_node_dirty(r)) { | 1296 | else if (btree_node_dirty(r)) |
1298 | BUG_ON(btree_current_write(r)->owner); | 1297 | bch_btree_node_write(r, writes); |
1299 | btree_current_write(r)->owner = writes; | ||
1300 | closure_get(writes); | ||
1301 | |||
1302 | bch_btree_write(r, true, NULL); | ||
1303 | } | ||
1304 | 1298 | ||
1305 | up_write(&r->lock); | 1299 | up_write(&r->lock); |
1306 | } | 1300 | } |
@@ -1386,9 +1380,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, | |||
1386 | ret = btree_gc_recurse(b, op, writes, gc); | 1380 | ret = btree_gc_recurse(b, op, writes, gc); |
1387 | 1381 | ||
1388 | if (!b->written || btree_node_dirty(b)) { | 1382 | if (!b->written || btree_node_dirty(b)) { |
1389 | atomic_inc(&b->c->prio_blocked); | 1383 | bch_btree_node_write(b, n ? &op->cl : NULL); |
1390 | b->prio_blocked++; | ||
1391 | bch_btree_write(b, true, n ? op : NULL); | ||
1392 | } | 1384 | } |
1393 | 1385 | ||
1394 | if (!IS_ERR_OR_NULL(n)) { | 1386 | if (!IS_ERR_OR_NULL(n)) { |
@@ -1405,7 +1397,6 @@ static void btree_gc_start(struct cache_set *c) | |||
1405 | { | 1397 | { |
1406 | struct cache *ca; | 1398 | struct cache *ca; |
1407 | struct bucket *b; | 1399 | struct bucket *b; |
1408 | struct bcache_device **d; | ||
1409 | unsigned i; | 1400 | unsigned i; |
1410 | 1401 | ||
1411 | if (!c->gc_mark_valid) | 1402 | if (!c->gc_mark_valid) |
@@ -1419,16 +1410,12 @@ static void btree_gc_start(struct cache_set *c) | |||
1419 | for_each_cache(ca, c, i) | 1410 | for_each_cache(ca, c, i) |
1420 | for_each_bucket(b, ca) { | 1411 | for_each_bucket(b, ca) { |
1421 | b->gc_gen = b->gen; | 1412 | b->gc_gen = b->gen; |
1422 | if (!atomic_read(&b->pin)) | 1413 | if (!atomic_read(&b->pin)) { |
1423 | SET_GC_MARK(b, GC_MARK_RECLAIMABLE); | 1414 | SET_GC_MARK(b, GC_MARK_RECLAIMABLE); |
1415 | SET_GC_SECTORS_USED(b, 0); | ||
1416 | } | ||
1424 | } | 1417 | } |
1425 | 1418 | ||
1426 | for (d = c->devices; | ||
1427 | d < c->devices + c->nr_uuids; | ||
1428 | d++) | ||
1429 | if (*d) | ||
1430 | (*d)->sectors_dirty_gc = 0; | ||
1431 | |||
1432 | mutex_unlock(&c->bucket_lock); | 1419 | mutex_unlock(&c->bucket_lock); |
1433 | } | 1420 | } |
1434 | 1421 | ||
@@ -1437,7 +1424,6 @@ size_t bch_btree_gc_finish(struct cache_set *c) | |||
1437 | size_t available = 0; | 1424 | size_t available = 0; |
1438 | struct bucket *b; | 1425 | struct bucket *b; |
1439 | struct cache *ca; | 1426 | struct cache *ca; |
1440 | struct bcache_device **d; | ||
1441 | unsigned i; | 1427 | unsigned i; |
1442 | 1428 | ||
1443 | mutex_lock(&c->bucket_lock); | 1429 | mutex_lock(&c->bucket_lock); |
@@ -1480,22 +1466,6 @@ size_t bch_btree_gc_finish(struct cache_set *c) | |||
1480 | } | 1466 | } |
1481 | } | 1467 | } |
1482 | 1468 | ||
1483 | for (d = c->devices; | ||
1484 | d < c->devices + c->nr_uuids; | ||
1485 | d++) | ||
1486 | if (*d) { | ||
1487 | unsigned long last = | ||
1488 | atomic_long_read(&((*d)->sectors_dirty)); | ||
1489 | long difference = (*d)->sectors_dirty_gc - last; | ||
1490 | |||
1491 | pr_debug("sectors dirty off by %li", difference); | ||
1492 | |||
1493 | (*d)->sectors_dirty_last += difference; | ||
1494 | |||
1495 | atomic_long_set(&((*d)->sectors_dirty), | ||
1496 | (*d)->sectors_dirty_gc); | ||
1497 | } | ||
1498 | |||
1499 | mutex_unlock(&c->bucket_lock); | 1469 | mutex_unlock(&c->bucket_lock); |
1500 | return available; | 1470 | return available; |
1501 | } | 1471 | } |
@@ -1508,10 +1478,9 @@ static void bch_btree_gc(struct closure *cl) | |||
1508 | struct gc_stat stats; | 1478 | struct gc_stat stats; |
1509 | struct closure writes; | 1479 | struct closure writes; |
1510 | struct btree_op op; | 1480 | struct btree_op op; |
1511 | |||
1512 | uint64_t start_time = local_clock(); | 1481 | uint64_t start_time = local_clock(); |
1513 | trace_bcache_gc_start(c->sb.set_uuid); | 1482 | |
1514 | blktrace_msg_all(c, "Starting gc"); | 1483 | trace_bcache_gc_start(c); |
1515 | 1484 | ||
1516 | memset(&stats, 0, sizeof(struct gc_stat)); | 1485 | memset(&stats, 0, sizeof(struct gc_stat)); |
1517 | closure_init_stack(&writes); | 1486 | closure_init_stack(&writes); |
@@ -1520,14 +1489,14 @@ static void bch_btree_gc(struct closure *cl) | |||
1520 | 1489 | ||
1521 | btree_gc_start(c); | 1490 | btree_gc_start(c); |
1522 | 1491 | ||
1492 | atomic_inc(&c->prio_blocked); | ||
1493 | |||
1523 | ret = btree_root(gc_root, c, &op, &writes, &stats); | 1494 | ret = btree_root(gc_root, c, &op, &writes, &stats); |
1524 | closure_sync(&op.cl); | 1495 | closure_sync(&op.cl); |
1525 | closure_sync(&writes); | 1496 | closure_sync(&writes); |
1526 | 1497 | ||
1527 | if (ret) { | 1498 | if (ret) { |
1528 | blktrace_msg_all(c, "Stopped gc"); | ||
1529 | pr_warn("gc failed!"); | 1499 | pr_warn("gc failed!"); |
1530 | |||
1531 | continue_at(cl, bch_btree_gc, bch_gc_wq); | 1500 | continue_at(cl, bch_btree_gc, bch_gc_wq); |
1532 | } | 1501 | } |
1533 | 1502 | ||
@@ -1537,6 +1506,9 @@ static void bch_btree_gc(struct closure *cl) | |||
1537 | 1506 | ||
1538 | available = bch_btree_gc_finish(c); | 1507 | available = bch_btree_gc_finish(c); |
1539 | 1508 | ||
1509 | atomic_dec(&c->prio_blocked); | ||
1510 | wake_up_allocators(c); | ||
1511 | |||
1540 | bch_time_stats_update(&c->btree_gc_time, start_time); | 1512 | bch_time_stats_update(&c->btree_gc_time, start_time); |
1541 | 1513 | ||
1542 | stats.key_bytes *= sizeof(uint64_t); | 1514 | stats.key_bytes *= sizeof(uint64_t); |
@@ -1544,10 +1516,8 @@ static void bch_btree_gc(struct closure *cl) | |||
1544 | stats.data <<= 9; | 1516 | stats.data <<= 9; |
1545 | stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; | 1517 | stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; |
1546 | memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); | 1518 | memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); |
1547 | blktrace_msg_all(c, "Finished gc"); | ||
1548 | 1519 | ||
1549 | trace_bcache_gc_end(c->sb.set_uuid); | 1520 | trace_bcache_gc_end(c); |
1550 | wake_up(&c->alloc_wait); | ||
1551 | 1521 | ||
1552 | continue_at(cl, bch_moving_gc, bch_gc_wq); | 1522 | continue_at(cl, bch_moving_gc, bch_gc_wq); |
1553 | } | 1523 | } |
@@ -1654,14 +1624,14 @@ static bool fix_overlapping_extents(struct btree *b, | |||
1654 | struct btree_iter *iter, | 1624 | struct btree_iter *iter, |
1655 | struct btree_op *op) | 1625 | struct btree_op *op) |
1656 | { | 1626 | { |
1657 | void subtract_dirty(struct bkey *k, int sectors) | 1627 | void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) |
1658 | { | 1628 | { |
1659 | struct bcache_device *d = b->c->devices[KEY_INODE(k)]; | 1629 | if (KEY_DIRTY(k)) |
1660 | 1630 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), | |
1661 | if (KEY_DIRTY(k) && d) | 1631 | offset, -sectors); |
1662 | atomic_long_sub(sectors, &d->sectors_dirty); | ||
1663 | } | 1632 | } |
1664 | 1633 | ||
1634 | uint64_t old_offset; | ||
1665 | unsigned old_size, sectors_found = 0; | 1635 | unsigned old_size, sectors_found = 0; |
1666 | 1636 | ||
1667 | while (1) { | 1637 | while (1) { |
@@ -1673,6 +1643,7 @@ static bool fix_overlapping_extents(struct btree *b, | |||
1673 | if (bkey_cmp(k, &START_KEY(insert)) <= 0) | 1643 | if (bkey_cmp(k, &START_KEY(insert)) <= 0) |
1674 | continue; | 1644 | continue; |
1675 | 1645 | ||
1646 | old_offset = KEY_START(k); | ||
1676 | old_size = KEY_SIZE(k); | 1647 | old_size = KEY_SIZE(k); |
1677 | 1648 | ||
1678 | /* | 1649 | /* |
@@ -1728,7 +1699,7 @@ static bool fix_overlapping_extents(struct btree *b, | |||
1728 | 1699 | ||
1729 | struct bkey *top; | 1700 | struct bkey *top; |
1730 | 1701 | ||
1731 | subtract_dirty(k, KEY_SIZE(insert)); | 1702 | subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); |
1732 | 1703 | ||
1733 | if (bkey_written(b, k)) { | 1704 | if (bkey_written(b, k)) { |
1734 | /* | 1705 | /* |
@@ -1775,7 +1746,7 @@ static bool fix_overlapping_extents(struct btree *b, | |||
1775 | } | 1746 | } |
1776 | } | 1747 | } |
1777 | 1748 | ||
1778 | subtract_dirty(k, old_size - KEY_SIZE(k)); | 1749 | subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); |
1779 | } | 1750 | } |
1780 | 1751 | ||
1781 | check_failed: | 1752 | check_failed: |
@@ -1798,7 +1769,7 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, | |||
1798 | { | 1769 | { |
1799 | struct bset *i = b->sets[b->nsets].data; | 1770 | struct bset *i = b->sets[b->nsets].data; |
1800 | struct bkey *m, *prev; | 1771 | struct bkey *m, *prev; |
1801 | const char *status = "insert"; | 1772 | unsigned status = BTREE_INSERT_STATUS_INSERT; |
1802 | 1773 | ||
1803 | BUG_ON(bkey_cmp(k, &b->key) > 0); | 1774 | BUG_ON(bkey_cmp(k, &b->key) > 0); |
1804 | BUG_ON(b->level && !KEY_PTRS(k)); | 1775 | BUG_ON(b->level && !KEY_PTRS(k)); |
@@ -1831,17 +1802,17 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, | |||
1831 | goto insert; | 1802 | goto insert; |
1832 | 1803 | ||
1833 | /* prev is in the tree, if we merge we're done */ | 1804 | /* prev is in the tree, if we merge we're done */ |
1834 | status = "back merging"; | 1805 | status = BTREE_INSERT_STATUS_BACK_MERGE; |
1835 | if (prev && | 1806 | if (prev && |
1836 | bch_bkey_try_merge(b, prev, k)) | 1807 | bch_bkey_try_merge(b, prev, k)) |
1837 | goto merged; | 1808 | goto merged; |
1838 | 1809 | ||
1839 | status = "overwrote front"; | 1810 | status = BTREE_INSERT_STATUS_OVERWROTE; |
1840 | if (m != end(i) && | 1811 | if (m != end(i) && |
1841 | KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) | 1812 | KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) |
1842 | goto copy; | 1813 | goto copy; |
1843 | 1814 | ||
1844 | status = "front merge"; | 1815 | status = BTREE_INSERT_STATUS_FRONT_MERGE; |
1845 | if (m != end(i) && | 1816 | if (m != end(i) && |
1846 | bch_bkey_try_merge(b, k, m)) | 1817 | bch_bkey_try_merge(b, k, m)) |
1847 | goto copy; | 1818 | goto copy; |
@@ -1851,21 +1822,21 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, | |||
1851 | insert: shift_keys(b, m, k); | 1822 | insert: shift_keys(b, m, k); |
1852 | copy: bkey_copy(m, k); | 1823 | copy: bkey_copy(m, k); |
1853 | merged: | 1824 | merged: |
1854 | bch_check_keys(b, "%s for %s at %s: %s", status, | 1825 | if (KEY_DIRTY(k)) |
1855 | op_type(op), pbtree(b), pkey(k)); | 1826 | bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), |
1856 | bch_check_key_order_msg(b, i, "%s for %s at %s: %s", status, | 1827 | KEY_START(k), KEY_SIZE(k)); |
1857 | op_type(op), pbtree(b), pkey(k)); | 1828 | |
1829 | bch_check_keys(b, "%u for %s", status, op_type(op)); | ||
1858 | 1830 | ||
1859 | if (b->level && !KEY_OFFSET(k)) | 1831 | if (b->level && !KEY_OFFSET(k)) |
1860 | b->prio_blocked++; | 1832 | btree_current_write(b)->prio_blocked++; |
1861 | 1833 | ||
1862 | pr_debug("%s for %s at %s: %s", status, | 1834 | trace_bcache_btree_insert_key(b, k, op->type, status); |
1863 | op_type(op), pbtree(b), pkey(k)); | ||
1864 | 1835 | ||
1865 | return true; | 1836 | return true; |
1866 | } | 1837 | } |
1867 | 1838 | ||
1868 | bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) | 1839 | static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) |
1869 | { | 1840 | { |
1870 | bool ret = false; | 1841 | bool ret = false; |
1871 | struct bkey *k; | 1842 | struct bkey *k; |
@@ -1896,7 +1867,7 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, | |||
1896 | should_split(b)) | 1867 | should_split(b)) |
1897 | goto out; | 1868 | goto out; |
1898 | 1869 | ||
1899 | op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio)); | 1870 | op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio)); |
1900 | 1871 | ||
1901 | SET_KEY_PTRS(&op->replace, 1); | 1872 | SET_KEY_PTRS(&op->replace, 1); |
1902 | get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); | 1873 | get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); |
@@ -1907,7 +1878,6 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, | |||
1907 | 1878 | ||
1908 | BUG_ON(op->type != BTREE_INSERT); | 1879 | BUG_ON(op->type != BTREE_INSERT); |
1909 | BUG_ON(!btree_insert_key(b, op, &tmp.k)); | 1880 | BUG_ON(!btree_insert_key(b, op, &tmp.k)); |
1910 | bch_btree_write(b, false, NULL); | ||
1911 | ret = true; | 1881 | ret = true; |
1912 | out: | 1882 | out: |
1913 | downgrade_write(&b->lock); | 1883 | downgrade_write(&b->lock); |
@@ -1929,12 +1899,11 @@ static int btree_split(struct btree *b, struct btree_op *op) | |||
1929 | 1899 | ||
1930 | split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; | 1900 | split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; |
1931 | 1901 | ||
1932 | pr_debug("%ssplitting at %s keys %i", split ? "" : "not ", | ||
1933 | pbtree(b), n1->sets[0].data->keys); | ||
1934 | |||
1935 | if (split) { | 1902 | if (split) { |
1936 | unsigned keys = 0; | 1903 | unsigned keys = 0; |
1937 | 1904 | ||
1905 | trace_bcache_btree_node_split(b, n1->sets[0].data->keys); | ||
1906 | |||
1938 | n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); | 1907 | n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); |
1939 | if (IS_ERR(n2)) | 1908 | if (IS_ERR(n2)) |
1940 | goto err_free1; | 1909 | goto err_free1; |
@@ -1967,18 +1936,21 @@ static int btree_split(struct btree *b, struct btree_op *op) | |||
1967 | bkey_copy_key(&n2->key, &b->key); | 1936 | bkey_copy_key(&n2->key, &b->key); |
1968 | 1937 | ||
1969 | bch_keylist_add(&op->keys, &n2->key); | 1938 | bch_keylist_add(&op->keys, &n2->key); |
1970 | bch_btree_write(n2, true, op); | 1939 | bch_btree_node_write(n2, &op->cl); |
1971 | rw_unlock(true, n2); | 1940 | rw_unlock(true, n2); |
1972 | } else | 1941 | } else { |
1942 | trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); | ||
1943 | |||
1973 | bch_btree_insert_keys(n1, op); | 1944 | bch_btree_insert_keys(n1, op); |
1945 | } | ||
1974 | 1946 | ||
1975 | bch_keylist_add(&op->keys, &n1->key); | 1947 | bch_keylist_add(&op->keys, &n1->key); |
1976 | bch_btree_write(n1, true, op); | 1948 | bch_btree_node_write(n1, &op->cl); |
1977 | 1949 | ||
1978 | if (n3) { | 1950 | if (n3) { |
1979 | bkey_copy_key(&n3->key, &MAX_KEY); | 1951 | bkey_copy_key(&n3->key, &MAX_KEY); |
1980 | bch_btree_insert_keys(n3, op); | 1952 | bch_btree_insert_keys(n3, op); |
1981 | bch_btree_write(n3, true, op); | 1953 | bch_btree_node_write(n3, &op->cl); |
1982 | 1954 | ||
1983 | closure_sync(&op->cl); | 1955 | closure_sync(&op->cl); |
1984 | bch_btree_set_root(n3); | 1956 | bch_btree_set_root(n3); |
@@ -2082,8 +2054,12 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, | |||
2082 | 2054 | ||
2083 | BUG_ON(write_block(b) != b->sets[b->nsets].data); | 2055 | BUG_ON(write_block(b) != b->sets[b->nsets].data); |
2084 | 2056 | ||
2085 | if (bch_btree_insert_keys(b, op)) | 2057 | if (bch_btree_insert_keys(b, op)) { |
2086 | bch_btree_write(b, false, op); | 2058 | if (!b->level) |
2059 | bch_btree_leaf_dirty(b, op); | ||
2060 | else | ||
2061 | bch_btree_node_write(b, &op->cl); | ||
2062 | } | ||
2087 | } | 2063 | } |
2088 | 2064 | ||
2089 | return 0; | 2065 | return 0; |
@@ -2140,6 +2116,11 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c) | |||
2140 | void bch_btree_set_root(struct btree *b) | 2116 | void bch_btree_set_root(struct btree *b) |
2141 | { | 2117 | { |
2142 | unsigned i; | 2118 | unsigned i; |
2119 | struct closure cl; | ||
2120 | |||
2121 | closure_init_stack(&cl); | ||
2122 | |||
2123 | trace_bcache_btree_set_root(b); | ||
2143 | 2124 | ||
2144 | BUG_ON(!b->written); | 2125 | BUG_ON(!b->written); |
2145 | 2126 | ||
@@ -2153,8 +2134,8 @@ void bch_btree_set_root(struct btree *b) | |||
2153 | b->c->root = b; | 2134 | b->c->root = b; |
2154 | __bkey_put(b->c, &b->key); | 2135 | __bkey_put(b->c, &b->key); |
2155 | 2136 | ||
2156 | bch_journal_meta(b->c, NULL); | 2137 | bch_journal_meta(b->c, &cl); |
2157 | pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0)); | 2138 | closure_sync(&cl); |
2158 | } | 2139 | } |
2159 | 2140 | ||
2160 | /* Cache lookup */ | 2141 | /* Cache lookup */ |
@@ -2215,9 +2196,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, | |||
2215 | KEY_OFFSET(k) - bio->bi_sector); | 2196 | KEY_OFFSET(k) - bio->bi_sector); |
2216 | 2197 | ||
2217 | n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); | 2198 | n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); |
2218 | if (!n) | ||
2219 | return -EAGAIN; | ||
2220 | |||
2221 | if (n == bio) | 2199 | if (n == bio) |
2222 | op->lookup_done = true; | 2200 | op->lookup_done = true; |
2223 | 2201 | ||
@@ -2240,7 +2218,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, | |||
2240 | n->bi_end_io = bch_cache_read_endio; | 2218 | n->bi_end_io = bch_cache_read_endio; |
2241 | n->bi_private = &s->cl; | 2219 | n->bi_private = &s->cl; |
2242 | 2220 | ||
2243 | trace_bcache_cache_hit(n); | ||
2244 | __bch_submit_bbio(n, b->c); | 2221 | __bch_submit_bbio(n, b->c); |
2245 | } | 2222 | } |
2246 | 2223 | ||
@@ -2257,9 +2234,6 @@ int bch_btree_search_recurse(struct btree *b, struct btree_op *op) | |||
2257 | struct btree_iter iter; | 2234 | struct btree_iter iter; |
2258 | bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); | 2235 | bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); |
2259 | 2236 | ||
2260 | pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode, | ||
2261 | (uint64_t) bio->bi_sector); | ||
2262 | |||
2263 | do { | 2237 | do { |
2264 | k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); | 2238 | k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); |
2265 | if (!k) { | 2239 | if (!k) { |
@@ -2303,7 +2277,8 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, | |||
2303 | } | 2277 | } |
2304 | 2278 | ||
2305 | static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, | 2279 | static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, |
2306 | struct keybuf *buf, struct bkey *end) | 2280 | struct keybuf *buf, struct bkey *end, |
2281 | keybuf_pred_fn *pred) | ||
2307 | { | 2282 | { |
2308 | struct btree_iter iter; | 2283 | struct btree_iter iter; |
2309 | bch_btree_iter_init(b, &iter, &buf->last_scanned); | 2284 | bch_btree_iter_init(b, &iter, &buf->last_scanned); |
@@ -2322,11 +2297,9 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, | |||
2322 | if (bkey_cmp(&buf->last_scanned, end) >= 0) | 2297 | if (bkey_cmp(&buf->last_scanned, end) >= 0) |
2323 | break; | 2298 | break; |
2324 | 2299 | ||
2325 | if (buf->key_predicate(buf, k)) { | 2300 | if (pred(buf, k)) { |
2326 | struct keybuf_key *w; | 2301 | struct keybuf_key *w; |
2327 | 2302 | ||
2328 | pr_debug("%s", pkey(k)); | ||
2329 | |||
2330 | spin_lock(&buf->lock); | 2303 | spin_lock(&buf->lock); |
2331 | 2304 | ||
2332 | w = array_alloc(&buf->freelist); | 2305 | w = array_alloc(&buf->freelist); |
@@ -2343,7 +2316,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, | |||
2343 | if (!k) | 2316 | if (!k) |
2344 | break; | 2317 | break; |
2345 | 2318 | ||
2346 | btree(refill_keybuf, k, b, op, buf, end); | 2319 | btree(refill_keybuf, k, b, op, buf, end, pred); |
2347 | /* | 2320 | /* |
2348 | * Might get an error here, but can't really do anything | 2321 | * Might get an error here, but can't really do anything |
2349 | * and it'll get logged elsewhere. Just read what we | 2322 | * and it'll get logged elsewhere. Just read what we |
@@ -2361,7 +2334,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, | |||
2361 | } | 2334 | } |
2362 | 2335 | ||
2363 | void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, | 2336 | void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, |
2364 | struct bkey *end) | 2337 | struct bkey *end, keybuf_pred_fn *pred) |
2365 | { | 2338 | { |
2366 | struct bkey start = buf->last_scanned; | 2339 | struct bkey start = buf->last_scanned; |
2367 | struct btree_op op; | 2340 | struct btree_op op; |
@@ -2369,7 +2342,7 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, | |||
2369 | 2342 | ||
2370 | cond_resched(); | 2343 | cond_resched(); |
2371 | 2344 | ||
2372 | btree_root(refill_keybuf, c, &op, buf, end); | 2345 | btree_root(refill_keybuf, c, &op, buf, end, pred); |
2373 | closure_sync(&op.cl); | 2346 | closure_sync(&op.cl); |
2374 | 2347 | ||
2375 | pr_debug("found %s keys from %llu:%llu to %llu:%llu", | 2348 | pr_debug("found %s keys from %llu:%llu to %llu:%llu", |
@@ -2455,7 +2428,8 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf) | |||
2455 | 2428 | ||
2456 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, | 2429 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, |
2457 | struct keybuf *buf, | 2430 | struct keybuf *buf, |
2458 | struct bkey *end) | 2431 | struct bkey *end, |
2432 | keybuf_pred_fn *pred) | ||
2459 | { | 2433 | { |
2460 | struct keybuf_key *ret; | 2434 | struct keybuf_key *ret; |
2461 | 2435 | ||
@@ -2469,15 +2443,14 @@ struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, | |||
2469 | break; | 2443 | break; |
2470 | } | 2444 | } |
2471 | 2445 | ||
2472 | bch_refill_keybuf(c, buf, end); | 2446 | bch_refill_keybuf(c, buf, end, pred); |
2473 | } | 2447 | } |
2474 | 2448 | ||
2475 | return ret; | 2449 | return ret; |
2476 | } | 2450 | } |
2477 | 2451 | ||
2478 | void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn) | 2452 | void bch_keybuf_init(struct keybuf *buf) |
2479 | { | 2453 | { |
2480 | buf->key_predicate = fn; | ||
2481 | buf->last_scanned = MAX_KEY; | 2454 | buf->last_scanned = MAX_KEY; |
2482 | buf->keys = RB_ROOT; | 2455 | buf->keys = RB_ROOT; |
2483 | 2456 | ||
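Taken together, the btree.c changes split the old bch_btree_write(b, now, op) into two explicit paths: bch_btree_node_write() for writes that must go out now (non-leaf nodes, splits, gc, the delayed-work flush) and bch_btree_leaf_dirty() for leaf inserts, which only writes immediately when the open bset has grown too large. A rough standalone model of the leaf-side decision; the thresholds come from the hunks above and everything else is simplified:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096u

/* Simplified bch_btree_leaf_dirty(): arm a 30 second flush on the first
 * dirty key, and write right away only when the open bset would outgrow
 * roughly a page (and we are not inside bio submission). */
static const char *leaf_dirty(bool already_dirty, unsigned set_bytes,
			      bool in_bio_submission)
{
	if (!already_dirty)
		printf("queue_delayed_work(btree_io_wq, &b->work, 30 * HZ)\n");

	if (set_bytes > PAGE_SIZE - 48 && !in_bio_submission)
		return "bch_btree_node_write(b, NULL)";

	return "stay dirty; btree_node_write_work() flushes later";
}

int main(void)
{
	printf("%s\n", leaf_dirty(false, 512, false));
	printf("%s\n", leaf_dirty(true, PAGE_SIZE, false));
	return 0;
}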
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index af4a7092a28c..3333d3723633 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h | |||
@@ -102,7 +102,6 @@ | |||
102 | #include "debug.h" | 102 | #include "debug.h" |
103 | 103 | ||
104 | struct btree_write { | 104 | struct btree_write { |
105 | struct closure *owner; | ||
106 | atomic_t *journal; | 105 | atomic_t *journal; |
107 | 106 | ||
108 | /* If btree_split() frees a btree node, it writes a new pointer to that | 107 | /* If btree_split() frees a btree node, it writes a new pointer to that |
@@ -142,16 +141,12 @@ struct btree { | |||
142 | */ | 141 | */ |
143 | struct bset_tree sets[MAX_BSETS]; | 142 | struct bset_tree sets[MAX_BSETS]; |
144 | 143 | ||
145 | /* Used to refcount bio splits, also protects b->bio */ | 144 | /* For outstanding btree writes, used as a lock - protects write_idx */ |
146 | struct closure_with_waitlist io; | 145 | struct closure_with_waitlist io; |
147 | 146 | ||
148 | /* Gets transferred to w->prio_blocked - see the comment there */ | ||
149 | int prio_blocked; | ||
150 | |||
151 | struct list_head list; | 147 | struct list_head list; |
152 | struct delayed_work work; | 148 | struct delayed_work work; |
153 | 149 | ||
154 | uint64_t io_start_time; | ||
155 | struct btree_write writes[2]; | 150 | struct btree_write writes[2]; |
156 | struct bio *bio; | 151 | struct bio *bio; |
157 | }; | 152 | }; |
@@ -164,13 +159,11 @@ static inline void set_btree_node_ ## flag(struct btree *b) \ | |||
164 | { set_bit(BTREE_NODE_ ## flag, &b->flags); } \ | 159 | { set_bit(BTREE_NODE_ ## flag, &b->flags); } \ |
165 | 160 | ||
166 | enum btree_flags { | 161 | enum btree_flags { |
167 | BTREE_NODE_read_done, | ||
168 | BTREE_NODE_io_error, | 162 | BTREE_NODE_io_error, |
169 | BTREE_NODE_dirty, | 163 | BTREE_NODE_dirty, |
170 | BTREE_NODE_write_idx, | 164 | BTREE_NODE_write_idx, |
171 | }; | 165 | }; |
172 | 166 | ||
173 | BTREE_FLAG(read_done); | ||
174 | BTREE_FLAG(io_error); | 167 | BTREE_FLAG(io_error); |
175 | BTREE_FLAG(dirty); | 168 | BTREE_FLAG(dirty); |
176 | BTREE_FLAG(write_idx); | 169 | BTREE_FLAG(write_idx); |
@@ -278,6 +271,13 @@ struct btree_op { | |||
278 | BKEY_PADDED(replace); | 271 | BKEY_PADDED(replace); |
279 | }; | 272 | }; |
280 | 273 | ||
274 | enum { | ||
275 | BTREE_INSERT_STATUS_INSERT, | ||
276 | BTREE_INSERT_STATUS_BACK_MERGE, | ||
277 | BTREE_INSERT_STATUS_OVERWROTE, | ||
278 | BTREE_INSERT_STATUS_FRONT_MERGE, | ||
279 | }; | ||
280 | |||
281 | void bch_btree_op_init_stack(struct btree_op *); | 281 | void bch_btree_op_init_stack(struct btree_op *); |
282 | 282 | ||
283 | static inline void rw_lock(bool w, struct btree *b, int level) | 283 | static inline void rw_lock(bool w, struct btree *b, int level) |
@@ -293,9 +293,7 @@ static inline void rw_unlock(bool w, struct btree *b) | |||
293 | #ifdef CONFIG_BCACHE_EDEBUG | 293 | #ifdef CONFIG_BCACHE_EDEBUG |
294 | unsigned i; | 294 | unsigned i; |
295 | 295 | ||
296 | if (w && | 296 | if (w && b->key.ptr[0]) |
297 | b->key.ptr[0] && | ||
298 | btree_node_read_done(b)) | ||
299 | for (i = 0; i <= b->nsets; i++) | 297 | for (i = 0; i <= b->nsets; i++) |
300 | bch_check_key_order(b, b->sets[i].data); | 298 | bch_check_key_order(b, b->sets[i].data); |
301 | #endif | 299 | #endif |
@@ -370,9 +368,8 @@ static inline bool should_split(struct btree *b) | |||
370 | > btree_blocks(b)); | 368 | > btree_blocks(b)); |
371 | } | 369 | } |
372 | 370 | ||
373 | void bch_btree_read_done(struct closure *); | 371 | void bch_btree_node_read(struct btree *); |
374 | void bch_btree_read(struct btree *); | 372 | void bch_btree_node_write(struct btree *, struct closure *); |
375 | void bch_btree_write(struct btree *b, bool now, struct btree_op *op); | ||
376 | 373 | ||
377 | void bch_cannibalize_unlock(struct cache_set *, struct closure *); | 374 | void bch_cannibalize_unlock(struct cache_set *, struct closure *); |
378 | void bch_btree_set_root(struct btree *); | 375 | void bch_btree_set_root(struct btree *); |
@@ -380,7 +377,6 @@ struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); | |||
380 | struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, | 377 | struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, |
381 | int, struct btree_op *); | 378 | int, struct btree_op *); |
382 | 379 | ||
383 | bool bch_btree_insert_keys(struct btree *, struct btree_op *); | ||
384 | bool bch_btree_insert_check_key(struct btree *, struct btree_op *, | 380 | bool bch_btree_insert_check_key(struct btree *, struct btree_op *, |
385 | struct bio *); | 381 | struct bio *); |
386 | int bch_btree_insert(struct btree_op *, struct cache_set *); | 382 | int bch_btree_insert(struct btree_op *, struct cache_set *); |
@@ -393,13 +389,14 @@ void bch_moving_gc(struct closure *); | |||
393 | int bch_btree_check(struct cache_set *, struct btree_op *); | 389 | int bch_btree_check(struct cache_set *, struct btree_op *); |
394 | uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); | 390 | uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); |
395 | 391 | ||
396 | void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *); | 392 | void bch_keybuf_init(struct keybuf *); |
397 | void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *); | 393 | void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *, |
394 | keybuf_pred_fn *); | ||
398 | bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, | 395 | bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, |
399 | struct bkey *); | 396 | struct bkey *); |
400 | void bch_keybuf_del(struct keybuf *, struct keybuf_key *); | 397 | void bch_keybuf_del(struct keybuf *, struct keybuf_key *); |
401 | struct keybuf_key *bch_keybuf_next(struct keybuf *); | 398 | struct keybuf_key *bch_keybuf_next(struct keybuf *); |
402 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, | 399 | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *, |
403 | struct keybuf *, struct bkey *); | 400 | struct bkey *, keybuf_pred_fn *); |
404 | 401 | ||
405 | #endif | 402 | #endif |
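One API shift visible in these declarations: a keybuf no longer stores its predicate at init time, so callers pass a keybuf_pred_fn with each refill or rescan instead. A caller after this change looks roughly like the debug.c hunks further down; my_pred and scan_everything here are illustrative, not bcache code:

/* Sketch of a post-patch keybuf user: the predicate travels with the call
 * rather than living inside struct keybuf. */
static bool my_pred(struct keybuf *buf, struct bkey *k)
{
	return KEY_DIRTY(k);		/* select whatever keys the scan cares about */
}

static void scan_everything(struct cache_set *c, struct keybuf *buf)
{
	struct keybuf_key *w;

	bch_keybuf_init(buf);		/* no predicate argument any more */
	buf->last_scanned = KEY(0, 0, 0);

	while ((w = bch_keybuf_next_rescan(c, buf, &MAX_KEY, my_pred)))
		bch_keybuf_del(buf, w);
}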
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index bd05a9a8c7cf..9aba2017f0d1 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c | |||
@@ -66,16 +66,18 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) | |||
66 | } else { | 66 | } else { |
67 | struct closure *parent = cl->parent; | 67 | struct closure *parent = cl->parent; |
68 | struct closure_waitlist *wait = closure_waitlist(cl); | 68 | struct closure_waitlist *wait = closure_waitlist(cl); |
69 | closure_fn *destructor = cl->fn; | ||
69 | 70 | ||
70 | closure_debug_destroy(cl); | 71 | closure_debug_destroy(cl); |
71 | 72 | ||
73 | smp_mb(); | ||
72 | atomic_set(&cl->remaining, -1); | 74 | atomic_set(&cl->remaining, -1); |
73 | 75 | ||
74 | if (wait) | 76 | if (wait) |
75 | closure_wake_up(wait); | 77 | closure_wake_up(wait); |
76 | 78 | ||
77 | if (cl->fn) | 79 | if (destructor) |
78 | cl->fn(cl); | 80 | destructor(cl); |
79 | 81 | ||
80 | if (parent) | 82 | if (parent) |
81 | closure_put(parent); | 83 | closure_put(parent); |
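The closure.c change is an ordering fix: the destructor pointer is copied to a local and a barrier issued before remaining is set to -1, because code elsewhere in this series (see the closure_wait_event() on remaining == -1 in debug.c) may free or reuse the closure the moment it observes that value. A compact C11 model of the same pattern; the closure machinery is reduced to a stand-in struct and release ordering approximates the smp_mb():

#include <stdatomic.h>

struct completion_like {
	_Atomic int remaining;
	void (*fn)(struct completion_like *);
};

static void finish(struct completion_like *cl)
{
	/* Capture before publishing: once remaining reads -1, a waiter may
	 * recycle *cl, so cl->fn must not be read afterwards. */
	void (*destructor)(struct completion_like *) = cl->fn;

	atomic_store_explicit(&cl->remaining, -1, memory_order_release);

	if (destructor)
		destructor(cl);
}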
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 89fd5204924e..88e6411eab4f 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c | |||
@@ -47,11 +47,10 @@ const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) | |||
47 | return ""; | 47 | return ""; |
48 | } | 48 | } |
49 | 49 | ||
50 | struct keyprint_hack bch_pkey(const struct bkey *k) | 50 | int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) |
51 | { | 51 | { |
52 | unsigned i = 0; | 52 | unsigned i = 0; |
53 | struct keyprint_hack r; | 53 | char *out = buf, *end = buf + size; |
54 | char *out = r.s, *end = r.s + KEYHACK_SIZE; | ||
55 | 54 | ||
56 | #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) | 55 | #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) |
57 | 56 | ||
@@ -75,16 +74,14 @@ struct keyprint_hack bch_pkey(const struct bkey *k) | |||
75 | if (KEY_CSUM(k)) | 74 | if (KEY_CSUM(k)) |
76 | p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); | 75 | p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); |
77 | #undef p | 76 | #undef p |
78 | return r; | 77 | return out - buf; |
79 | } | 78 | } |
80 | 79 | ||
81 | struct keyprint_hack bch_pbtree(const struct btree *b) | 80 | int bch_btree_to_text(char *buf, size_t size, const struct btree *b) |
82 | { | 81 | { |
83 | struct keyprint_hack r; | 82 | return scnprintf(buf, size, "%zu level %i/%i", |
84 | 83 | PTR_BUCKET_NR(b->c, &b->key, 0), | |
85 | snprintf(r.s, 40, "%zu level %i/%i", PTR_BUCKET_NR(b->c, &b->key, 0), | 84 | b->level, b->c->root ? b->c->root->level : -1); |
86 | b->level, b->c->root ? b->c->root->level : -1); | ||
87 | return r; | ||
88 | } | 85 | } |
89 | 86 | ||
90 | #if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) | 87 | #if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) |
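These helpers replace the old struct-returning pkey()/pbtree() hack with plain format-into-a-buffer functions, so callers own the storage, as the dump_bset() and vdump_bucket_and_panic() hunks below show. A minimal usage sketch; k and b are a struct bkey * and struct btree * already in scope, and the 80-byte buffer simply follows the callers in this diff:

	char buf[80];

	bch_bkey_to_text(buf, sizeof(buf), k);	/* returns chars written, scnprintf-style */
	printk(KERN_ERR "bad key %s\n", buf);

	bch_btree_to_text(buf, sizeof(buf), b);
	printk(KERN_ERR "at %s\n", buf);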
@@ -100,10 +97,12 @@ static void dump_bset(struct btree *b, struct bset *i) | |||
100 | { | 97 | { |
101 | struct bkey *k; | 98 | struct bkey *k; |
102 | unsigned j; | 99 | unsigned j; |
100 | char buf[80]; | ||
103 | 101 | ||
104 | for (k = i->start; k < end(i); k = bkey_next(k)) { | 102 | for (k = i->start; k < end(i); k = bkey_next(k)) { |
103 | bch_bkey_to_text(buf, sizeof(buf), k); | ||
105 | printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), | 104 | printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), |
106 | (uint64_t *) k - i->d, i->keys, pkey(k)); | 105 | (uint64_t *) k - i->d, i->keys, buf); |
107 | 106 | ||
108 | for (j = 0; j < KEY_PTRS(k); j++) { | 107 | for (j = 0; j < KEY_PTRS(k); j++) { |
109 | size_t n = PTR_BUCKET_NR(b->c, k, j); | 108 | size_t n = PTR_BUCKET_NR(b->c, k, j); |
@@ -144,7 +143,7 @@ void bch_btree_verify(struct btree *b, struct bset *new) | |||
144 | v->written = 0; | 143 | v->written = 0; |
145 | v->level = b->level; | 144 | v->level = b->level; |
146 | 145 | ||
147 | bch_btree_read(v); | 146 | bch_btree_node_read(v); |
148 | closure_wait_event(&v->io.wait, &cl, | 147 | closure_wait_event(&v->io.wait, &cl, |
149 | atomic_read(&b->io.cl.remaining) == -1); | 148 | atomic_read(&b->io.cl.remaining) == -1); |
150 | 149 | ||
@@ -200,7 +199,7 @@ void bch_data_verify(struct search *s) | |||
200 | if (!check) | 199 | if (!check) |
201 | return; | 200 | return; |
202 | 201 | ||
203 | if (bch_bio_alloc_pages(check, GFP_NOIO)) | 202 | if (bio_alloc_pages(check, GFP_NOIO)) |
204 | goto out_put; | 203 | goto out_put; |
205 | 204 | ||
206 | check->bi_rw = READ_SYNC; | 205 | check->bi_rw = READ_SYNC; |
@@ -252,6 +251,7 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt, | |||
252 | va_list args) | 251 | va_list args) |
253 | { | 252 | { |
254 | unsigned i; | 253 | unsigned i; |
254 | char buf[80]; | ||
255 | 255 | ||
256 | console_lock(); | 256 | console_lock(); |
257 | 257 | ||
@@ -262,7 +262,8 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt, | |||
262 | 262 | ||
263 | console_unlock(); | 263 | console_unlock(); |
264 | 264 | ||
265 | panic("at %s\n", pbtree(b)); | 265 | bch_btree_to_text(buf, sizeof(buf), b); |
266 | panic("at %s\n", buf); | ||
266 | } | 267 | } |
267 | 268 | ||
268 | void bch_check_key_order_msg(struct btree *b, struct bset *i, | 269 | void bch_check_key_order_msg(struct btree *b, struct bset *i, |
@@ -337,6 +338,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, | |||
337 | { | 338 | { |
338 | struct dump_iterator *i = file->private_data; | 339 | struct dump_iterator *i = file->private_data; |
339 | ssize_t ret = 0; | 340 | ssize_t ret = 0; |
341 | char kbuf[80]; | ||
340 | 342 | ||
341 | while (size) { | 343 | while (size) { |
342 | struct keybuf_key *w; | 344 | struct keybuf_key *w; |
@@ -355,11 +357,12 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, | |||
355 | if (i->bytes) | 357 | if (i->bytes) |
356 | break; | 358 | break; |
357 | 359 | ||
358 | w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY); | 360 | w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred); |
359 | if (!w) | 361 | if (!w) |
360 | break; | 362 | break; |
361 | 363 | ||
362 | i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", pkey(&w->key)); | 364 | bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); |
365 | i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); | ||
363 | bch_keybuf_del(&i->keys, w); | 366 | bch_keybuf_del(&i->keys, w); |
364 | } | 367 | } |
365 | 368 | ||
@@ -377,7 +380,7 @@ static int bch_dump_open(struct inode *inode, struct file *file) | |||
377 | 380 | ||
378 | file->private_data = i; | 381 | file->private_data = i; |
379 | i->c = c; | 382 | i->c = c; |
380 | bch_keybuf_init(&i->keys, dump_pred); | 383 | bch_keybuf_init(&i->keys); |
381 | i->keys.last_scanned = KEY(0, 0, 0); | 384 | i->keys.last_scanned = KEY(0, 0, 0); |
382 | 385 | ||
383 | return 0; | 386 | return 0; |
@@ -409,142 +412,6 @@ void bch_debug_init_cache_set(struct cache_set *c) | |||
409 | 412 | ||
410 | #endif | 413 | #endif |
411 | 414 | ||
412 | /* Fuzz tester has rotted: */ | ||
413 | #if 0 | ||
414 | |||
415 | static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a, | ||
416 | const char *buffer, size_t size) | ||
417 | { | ||
418 | void dump(struct btree *b) | ||
419 | { | ||
420 | struct bset *i; | ||
421 | |||
422 | for (i = b->sets[0].data; | ||
423 | index(i, b) < btree_blocks(b) && | ||
424 | i->seq == b->sets[0].data->seq; | ||
425 | i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c)) | ||
426 | dump_bset(b, i); | ||
427 | } | ||
428 | |||
429 | struct cache_sb *sb; | ||
430 | struct cache_set *c; | ||
431 | struct btree *all[3], *b, *fill, *orig; | ||
432 | int j; | ||
433 | |||
434 | struct btree_op op; | ||
435 | bch_btree_op_init_stack(&op); | ||
436 | |||
437 | sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL); | ||
438 | if (!sb) | ||
439 | return -ENOMEM; | ||
440 | |||
441 | sb->bucket_size = 128; | ||
442 | sb->block_size = 4; | ||
443 | |||
444 | c = bch_cache_set_alloc(sb); | ||
445 | if (!c) | ||
446 | return -ENOMEM; | ||
447 | |||
448 | for (j = 0; j < 3; j++) { | ||
449 | BUG_ON(list_empty(&c->btree_cache)); | ||
450 | all[j] = list_first_entry(&c->btree_cache, struct btree, list); | ||
451 | list_del_init(&all[j]->list); | ||
452 | |||
453 | all[j]->key = KEY(0, 0, c->sb.bucket_size); | ||
454 | bkey_copy_key(&all[j]->key, &MAX_KEY); | ||
455 | } | ||
456 | |||
457 | b = all[0]; | ||
458 | fill = all[1]; | ||
459 | orig = all[2]; | ||
460 | |||
461 | while (1) { | ||
462 | for (j = 0; j < 3; j++) | ||
463 | all[j]->written = all[j]->nsets = 0; | ||
464 | |||
465 | bch_bset_init_next(b); | ||
466 | |||
467 | while (1) { | ||
468 | struct bset *i = write_block(b); | ||
469 | struct bkey *k = op.keys.top; | ||
470 | unsigned rand; | ||
471 | |||
472 | bkey_init(k); | ||
473 | rand = get_random_int(); | ||
474 | |||
475 | op.type = rand & 1 | ||
476 | ? BTREE_INSERT | ||
477 | : BTREE_REPLACE; | ||
478 | rand >>= 1; | ||
479 | |||
480 | SET_KEY_SIZE(k, bucket_remainder(c, rand)); | ||
481 | rand >>= c->bucket_bits; | ||
482 | rand &= 1024 * 512 - 1; | ||
483 | rand += c->sb.bucket_size; | ||
484 | SET_KEY_OFFSET(k, rand); | ||
485 | #if 0 | ||
486 | SET_KEY_PTRS(k, 1); | ||
487 | #endif | ||
488 | bch_keylist_push(&op.keys); | ||
489 | bch_btree_insert_keys(b, &op); | ||
490 | |||
491 | if (should_split(b) || | ||
492 | set_blocks(i, b->c) != | ||
493 | __set_blocks(i, i->keys + 15, b->c)) { | ||
494 | i->csum = csum_set(i); | ||
495 | |||
496 | memcpy(write_block(fill), | ||
497 | i, set_bytes(i)); | ||
498 | |||
499 | b->written += set_blocks(i, b->c); | ||
500 | fill->written = b->written; | ||
501 | if (b->written == btree_blocks(b)) | ||
502 | break; | ||
503 | |||
504 | bch_btree_sort_lazy(b); | ||
505 | bch_bset_init_next(b); | ||
506 | } | ||
507 | } | ||
508 | |||
509 | memcpy(orig->sets[0].data, | ||
510 | fill->sets[0].data, | ||
511 | btree_bytes(c)); | ||
512 | |||
513 | bch_btree_sort(b); | ||
514 | fill->written = 0; | ||
515 | bch_btree_read_done(&fill->io.cl); | ||
516 | |||
517 | if (b->sets[0].data->keys != fill->sets[0].data->keys || | ||
518 | memcmp(b->sets[0].data->start, | ||
519 | fill->sets[0].data->start, | ||
520 | b->sets[0].data->keys * sizeof(uint64_t))) { | ||
521 | struct bset *i = b->sets[0].data; | ||
522 | struct bkey *k, *l; | ||
523 | |||
524 | for (k = i->start, | ||
525 | l = fill->sets[0].data->start; | ||
526 | k < end(i); | ||
527 | k = bkey_next(k), l = bkey_next(l)) | ||
528 | if (bkey_cmp(k, l) || | ||
529 | KEY_SIZE(k) != KEY_SIZE(l)) | ||
530 | pr_err("key %zi differs: %s != %s", | ||
531 | (uint64_t *) k - i->d, | ||
532 | pkey(k), pkey(l)); | ||
533 | |||
534 | for (j = 0; j < 3; j++) { | ||
535 | pr_err("**** Set %i ****", j); | ||
536 | dump(all[j]); | ||
537 | } | ||
538 | panic("\n"); | ||
539 | } | ||
540 | |||
541 | pr_info("fuzz complete: %i keys", b->sets[0].data->keys); | ||
542 | } | ||
543 | } | ||
544 | |||
545 | kobj_attribute_write(fuzz, btree_fuzz); | ||
546 | #endif | ||
547 | |||
548 | void bch_debug_exit(void) | 415 | void bch_debug_exit(void) |
549 | { | 416 | { |
550 | if (!IS_ERR_OR_NULL(debug)) | 417 | if (!IS_ERR_OR_NULL(debug)) |
@@ -554,11 +421,6 @@ void bch_debug_exit(void) | |||
554 | int __init bch_debug_init(struct kobject *kobj) | 421 | int __init bch_debug_init(struct kobject *kobj) |
555 | { | 422 | { |
556 | int ret = 0; | 423 | int ret = 0; |
557 | #if 0 | ||
558 | ret = sysfs_create_file(kobj, &ksysfs_fuzz.attr); | ||
559 | if (ret) | ||
560 | return ret; | ||
561 | #endif | ||
562 | 424 | ||
563 | debug = debugfs_create_dir("bcache", NULL); | 425 | debug = debugfs_create_dir("bcache", NULL); |
564 | return ret; | 426 | return ret; |
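The debug.c changes above drop the old pkey()/pbtree() helpers, which returned a struct wrapping a fixed 80-byte array, in favour of bch_bkey_to_text()/bch_btree_to_text(), which format into a buffer the caller owns. A minimal userspace sketch of the before/after pattern; key_to_str_old and key_to_text are illustrative names, not bcache symbols:

#include <stdio.h>

/* Old style: return a struct by value so the formatted string survives
 * the call.  Every call copies the whole 80-byte array and the caller
 * cannot choose the buffer size. */
struct keystr { char s[80]; };

static struct keystr key_to_str_old(unsigned inode, unsigned long long offset)
{
	struct keystr r;

	snprintf(r.s, sizeof(r.s), "%u:%llu", inode, offset);
	return r;
}

/* New style: the caller supplies the buffer and its size. */
static int key_to_text(char *buf, size_t size, unsigned inode,
		       unsigned long long offset)
{
	return snprintf(buf, size, "%u:%llu", inode, offset);
}

int main(void)
{
	char buf[80];

	printf("old: %s\n", key_to_str_old(1, 4096).s);
	key_to_text(buf, sizeof(buf), 1, 4096);
	printf("new: %s\n", buf);
	return 0;
}

The caller-supplied buffer avoids copying an 80-byte struct through every call and lets each converted site above declare a plain char buf[80] on its own stack.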
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index f9378a218148..1c39b5a2489b 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h | |||
@@ -3,15 +3,8 @@ | |||
3 | 3 | ||
4 | /* Btree/bkey debug printing */ | 4 | /* Btree/bkey debug printing */ |
5 | 5 | ||
6 | #define KEYHACK_SIZE 80 | 6 | int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); |
7 | struct keyprint_hack { | 7 | int bch_btree_to_text(char *buf, size_t size, const struct btree *b); |
8 | char s[KEYHACK_SIZE]; | ||
9 | }; | ||
10 | |||
11 | struct keyprint_hack bch_pkey(const struct bkey *k); | ||
12 | struct keyprint_hack bch_pbtree(const struct btree *b); | ||
13 | #define pkey(k) (&bch_pkey(k).s[0]) | ||
14 | #define pbtree(b) (&bch_pbtree(b).s[0]) | ||
15 | 8 | ||
16 | #ifdef CONFIG_BCACHE_EDEBUG | 9 | #ifdef CONFIG_BCACHE_EDEBUG |
17 | 10 | ||
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 48efd4dea645..9056632995b1 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c | |||
@@ -9,6 +9,8 @@ | |||
9 | #include "bset.h" | 9 | #include "bset.h" |
10 | #include "debug.h" | 10 | #include "debug.h" |
11 | 11 | ||
12 | #include <linux/blkdev.h> | ||
13 | |||
12 | static void bch_bi_idx_hack_endio(struct bio *bio, int error) | 14 | static void bch_bi_idx_hack_endio(struct bio *bio, int error) |
13 | { | 15 | { |
14 | struct bio *p = bio->bi_private; | 16 | struct bio *p = bio->bi_private; |
@@ -66,13 +68,6 @@ static void bch_generic_make_request_hack(struct bio *bio) | |||
66 | * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a | 68 | * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a |
67 | * bvec boundary; it is the caller's responsibility to ensure that @bio is not | 69 | * bvec boundary; it is the caller's responsibility to ensure that @bio is not |
67 | * bvec boundary; it is the caller's responsibility to ensure that @bio is not | 69 | * bvec boundary; it is the caller's responsibility to ensure that @bio is not |
68 | * freed before the split. | 70 | * freed before the split. |
69 | * | ||
70 | * If bch_bio_split() is running under generic_make_request(), it's not safe to | ||
71 | * allocate more than one bio from the same bio set. Therefore, if it is running | ||
72 | * under generic_make_request() it masks out __GFP_WAIT when doing the | ||
73 | * allocation. The caller must check for failure if there's any possibility of | ||
74 | * it being called from under generic_make_request(); it is then the caller's | ||
75 | * responsibility to retry from a safe context (by e.g. punting to workqueue). | ||
76 | */ | 71 | */ |
77 | struct bio *bch_bio_split(struct bio *bio, int sectors, | 72 | struct bio *bch_bio_split(struct bio *bio, int sectors, |
78 | gfp_t gfp, struct bio_set *bs) | 73 | gfp_t gfp, struct bio_set *bs) |
@@ -83,20 +78,13 @@ struct bio *bch_bio_split(struct bio *bio, int sectors, | |||
83 | 78 | ||
84 | BUG_ON(sectors <= 0); | 79 | BUG_ON(sectors <= 0); |
85 | 80 | ||
86 | /* | ||
87 | * If we're being called from underneath generic_make_request() and we | ||
88 | * already allocated any bios from this bio set, we risk deadlock if we | ||
89 | * use the mempool. So instead, we possibly fail and let the caller punt | ||
90 | * to workqueue or somesuch and retry in a safe context. | ||
91 | */ | ||
92 | if (current->bio_list) | ||
93 | gfp &= ~__GFP_WAIT; | ||
94 | |||
95 | if (sectors >= bio_sectors(bio)) | 81 | if (sectors >= bio_sectors(bio)) |
96 | return bio; | 82 | return bio; |
97 | 83 | ||
98 | if (bio->bi_rw & REQ_DISCARD) { | 84 | if (bio->bi_rw & REQ_DISCARD) { |
99 | ret = bio_alloc_bioset(gfp, 1, bs); | 85 | ret = bio_alloc_bioset(gfp, 1, bs); |
86 | if (!ret) | ||
87 | return NULL; | ||
100 | idx = 0; | 88 | idx = 0; |
101 | goto out; | 89 | goto out; |
102 | } | 90 | } |
@@ -160,17 +148,18 @@ static unsigned bch_bio_max_sectors(struct bio *bio) | |||
160 | struct request_queue *q = bdev_get_queue(bio->bi_bdev); | 148 | struct request_queue *q = bdev_get_queue(bio->bi_bdev); |
161 | unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES, | 149 | unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES, |
162 | queue_max_segments(q)); | 150 | queue_max_segments(q)); |
163 | struct bio_vec *bv, *end = bio_iovec(bio) + | ||
164 | min_t(int, bio_segments(bio), max_segments); | ||
165 | 151 | ||
166 | if (bio->bi_rw & REQ_DISCARD) | 152 | if (bio->bi_rw & REQ_DISCARD) |
167 | return min(ret, q->limits.max_discard_sectors); | 153 | return min(ret, q->limits.max_discard_sectors); |
168 | 154 | ||
169 | if (bio_segments(bio) > max_segments || | 155 | if (bio_segments(bio) > max_segments || |
170 | q->merge_bvec_fn) { | 156 | q->merge_bvec_fn) { |
157 | struct bio_vec *bv; | ||
158 | int i, seg = 0; | ||
159 | |||
171 | ret = 0; | 160 | ret = 0; |
172 | 161 | ||
173 | for (bv = bio_iovec(bio); bv < end; bv++) { | 162 | bio_for_each_segment(bv, bio, i) { |
174 | struct bvec_merge_data bvm = { | 163 | struct bvec_merge_data bvm = { |
175 | .bi_bdev = bio->bi_bdev, | 164 | .bi_bdev = bio->bi_bdev, |
176 | .bi_sector = bio->bi_sector, | 165 | .bi_sector = bio->bi_sector, |
@@ -178,10 +167,14 @@ static unsigned bch_bio_max_sectors(struct bio *bio) | |||
178 | .bi_rw = bio->bi_rw, | 167 | .bi_rw = bio->bi_rw, |
179 | }; | 168 | }; |
180 | 169 | ||
170 | if (seg == max_segments) | ||
171 | break; | ||
172 | |||
181 | if (q->merge_bvec_fn && | 173 | if (q->merge_bvec_fn && |
182 | q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) | 174 | q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) |
183 | break; | 175 | break; |
184 | 176 | ||
177 | seg++; | ||
185 | ret += bv->bv_len >> 9; | 178 | ret += bv->bv_len >> 9; |
186 | } | 179 | } |
187 | } | 180 | } |
@@ -218,30 +211,10 @@ static void bch_bio_submit_split_endio(struct bio *bio, int error) | |||
218 | closure_put(cl); | 211 | closure_put(cl); |
219 | } | 212 | } |
220 | 213 | ||
221 | static void __bch_bio_submit_split(struct closure *cl) | ||
222 | { | ||
223 | struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); | ||
224 | struct bio *bio = s->bio, *n; | ||
225 | |||
226 | do { | ||
227 | n = bch_bio_split(bio, bch_bio_max_sectors(bio), | ||
228 | GFP_NOIO, s->p->bio_split); | ||
229 | if (!n) | ||
230 | continue_at(cl, __bch_bio_submit_split, system_wq); | ||
231 | |||
232 | n->bi_end_io = bch_bio_submit_split_endio; | ||
233 | n->bi_private = cl; | ||
234 | |||
235 | closure_get(cl); | ||
236 | bch_generic_make_request_hack(n); | ||
237 | } while (n != bio); | ||
238 | |||
239 | continue_at(cl, bch_bio_submit_split_done, NULL); | ||
240 | } | ||
241 | |||
242 | void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) | 214 | void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) |
243 | { | 215 | { |
244 | struct bio_split_hook *s; | 216 | struct bio_split_hook *s; |
217 | struct bio *n; | ||
245 | 218 | ||
246 | if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD)) | 219 | if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD)) |
247 | goto submit; | 220 | goto submit; |
@@ -250,6 +223,7 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) | |||
250 | goto submit; | 223 | goto submit; |
251 | 224 | ||
252 | s = mempool_alloc(p->bio_split_hook, GFP_NOIO); | 225 | s = mempool_alloc(p->bio_split_hook, GFP_NOIO); |
226 | closure_init(&s->cl, NULL); | ||
253 | 227 | ||
254 | s->bio = bio; | 228 | s->bio = bio; |
255 | s->p = p; | 229 | s->p = p; |
@@ -257,8 +231,18 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) | |||
257 | s->bi_private = bio->bi_private; | 231 | s->bi_private = bio->bi_private; |
258 | bio_get(bio); | 232 | bio_get(bio); |
259 | 233 | ||
260 | closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL); | 234 | do { |
261 | return; | 235 | n = bch_bio_split(bio, bch_bio_max_sectors(bio), |
236 | GFP_NOIO, s->p->bio_split); | ||
237 | |||
238 | n->bi_end_io = bch_bio_submit_split_endio; | ||
239 | n->bi_private = &s->cl; | ||
240 | |||
241 | closure_get(&s->cl); | ||
242 | bch_generic_make_request_hack(n); | ||
243 | } while (n != bio); | ||
244 | |||
245 | continue_at(&s->cl, bch_bio_submit_split_done, NULL); | ||
262 | submit: | 246 | submit: |
263 | bch_generic_make_request_hack(bio); | 247 | bch_generic_make_request_hack(bio); |
264 | } | 248 | } |
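With the __GFP_WAIT masking and the retry-from-workqueue path removed, bch_generic_make_request() now runs the split loop inline, on the assumption that the GFP_NOIO, mempool-backed allocations in bch_bio_split() do not fail (only the discard path keeps an explicit NULL check). A userspace sketch of the same chunking loop under that assumption; submit() and MAX_CHUNK_SECTORS are stand-ins, not kernel names:

#include <stdio.h>

#define MAX_CHUNK_SECTORS 128	/* stand-in for bch_bio_max_sectors() */

static void submit(unsigned start, unsigned sectors)
{
	printf("submit sectors %u..%u\n", start, start + sectors - 1);
}

/* Mirrors the do { n = split(bio); submit(n); } while (n != bio) loop:
 * peel bounded chunks off the front until the remainder fits in one
 * submission, then submit the remainder itself (the original "bio"). */
static void split_and_submit(unsigned start, unsigned sectors)
{
	while (sectors > MAX_CHUNK_SECTORS) {
		submit(start, MAX_CHUNK_SECTORS);
		start += MAX_CHUNK_SECTORS;
		sectors -= MAX_CHUNK_SECTORS;
	}
	submit(start, sectors);
}

int main(void)
{
	split_and_submit(0, 300);
	return 0;
}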
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 8c8dfdcd9d4c..ba95ab84b2be 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c | |||
@@ -9,6 +9,8 @@ | |||
9 | #include "debug.h" | 9 | #include "debug.h" |
10 | #include "request.h" | 10 | #include "request.h" |
11 | 11 | ||
12 | #include <trace/events/bcache.h> | ||
13 | |||
12 | /* | 14 | /* |
13 | * Journal replay/recovery: | 15 | * Journal replay/recovery: |
14 | * | 16 | * |
@@ -182,9 +184,14 @@ bsearch: | |||
182 | pr_debug("starting binary search, l %u r %u", l, r); | 184 | pr_debug("starting binary search, l %u r %u", l, r); |
183 | 185 | ||
184 | while (l + 1 < r) { | 186 | while (l + 1 < r) { |
187 | seq = list_entry(list->prev, struct journal_replay, | ||
188 | list)->j.seq; | ||
189 | |||
185 | m = (l + r) >> 1; | 190 | m = (l + r) >> 1; |
191 | read_bucket(m); | ||
186 | 192 | ||
187 | if (read_bucket(m)) | 193 | if (seq != list_entry(list->prev, struct journal_replay, |
194 | list)->j.seq) | ||
188 | l = m; | 195 | l = m; |
189 | else | 196 | else |
190 | r = m; | 197 | r = m; |
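The journal-replay binary search above no longer trusts read_bucket()'s return value; it records the sequence number at the tail of the replay list, reads bucket m, and treats the bucket as holding fresh entries only if that tail sequence advanced. A toy version of the same search for the last bucket with fresh journal entries; bucket_seq and FRESH are invented test data, not bcache structures:

#include <stdbool.h>
#include <stdio.h>

/* Toy model: the journal wrapped, so buckets 0..4 hold fresh entries and
 * buckets 5..7 still hold stale ones from the previous pass. */
static const unsigned long long bucket_seq[8] = { 40, 41, 42, 43, 44, 10, 11, 12 };
#define FRESH 40ULL

/* Stand-in for "read_bucket(m) appended something newer than everything
 * already on the replay list", which the kernel detects by comparing the
 * list tail's seq before and after the read. */
static bool read_bucket_found_new(unsigned m)
{
	return bucket_seq[m] >= FRESH;
}

int main(void)
{
	unsigned l = 0, r = 8, m;

	/* Invariant: bucket l has fresh entries, bucket r does not (or is
	 * one past the end); narrow the window until they are adjacent. */
	while (l + 1 < r) {
		m = (l + r) / 2;
		if (read_bucket_found_new(m))
			l = m;
		else
			r = m;
	}
	printf("last bucket with fresh journal entries: %u\n", l);
	return 0;
}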
@@ -300,7 +307,8 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, | |||
300 | for (k = i->j.start; | 307 | for (k = i->j.start; |
301 | k < end(&i->j); | 308 | k < end(&i->j); |
302 | k = bkey_next(k)) { | 309 | k = bkey_next(k)) { |
303 | pr_debug("%s", pkey(k)); | 310 | trace_bcache_journal_replay_key(k); |
311 | |||
304 | bkey_copy(op->keys.top, k); | 312 | bkey_copy(op->keys.top, k); |
305 | bch_keylist_push(&op->keys); | 313 | bch_keylist_push(&op->keys); |
306 | 314 | ||
@@ -384,7 +392,7 @@ out: | |||
384 | return; | 392 | return; |
385 | found: | 393 | found: |
386 | if (btree_node_dirty(best)) | 394 | if (btree_node_dirty(best)) |
387 | bch_btree_write(best, true, NULL); | 395 | bch_btree_node_write(best, NULL); |
388 | rw_unlock(true, best); | 396 | rw_unlock(true, best); |
389 | } | 397 | } |
390 | 398 | ||
@@ -617,7 +625,7 @@ static void journal_write_unlocked(struct closure *cl) | |||
617 | bio_reset(bio); | 625 | bio_reset(bio); |
618 | bio->bi_sector = PTR_OFFSET(k, i); | 626 | bio->bi_sector = PTR_OFFSET(k, i); |
619 | bio->bi_bdev = ca->bdev; | 627 | bio->bi_bdev = ca->bdev; |
620 | bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH; | 628 | bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA; |
621 | bio->bi_size = sectors << 9; | 629 | bio->bi_size = sectors << 9; |
622 | 630 | ||
623 | bio->bi_end_io = journal_write_endio; | 631 | bio->bi_end_io = journal_write_endio; |
@@ -712,7 +720,8 @@ void bch_journal(struct closure *cl) | |||
712 | spin_lock(&c->journal.lock); | 720 | spin_lock(&c->journal.lock); |
713 | 721 | ||
714 | if (journal_full(&c->journal)) { | 722 | if (journal_full(&c->journal)) { |
715 | /* XXX: tracepoint */ | 723 | trace_bcache_journal_full(c); |
724 | |||
716 | closure_wait(&c->journal.wait, cl); | 725 | closure_wait(&c->journal.wait, cl); |
717 | 726 | ||
718 | journal_reclaim(c); | 727 | journal_reclaim(c); |
@@ -728,13 +737,15 @@ void bch_journal(struct closure *cl) | |||
728 | 737 | ||
729 | if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS || | 738 | if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS || |
730 | b > c->journal.blocks_free) { | 739 | b > c->journal.blocks_free) { |
731 | /* XXX: If we were inserting so many keys that they won't fit in | 740 | trace_bcache_journal_entry_full(c); |
741 | |||
742 | /* | ||
743 | * XXX: If we were inserting so many keys that they won't fit in | ||
732 | * an _empty_ journal write, we'll deadlock. For now, handle | 744 | * an _empty_ journal write, we'll deadlock. For now, handle |
733 | * this in bch_keylist_realloc() - but something to think about. | 745 | * this in bch_keylist_realloc() - but something to think about. |
734 | */ | 746 | */ |
735 | BUG_ON(!w->data->keys); | 747 | BUG_ON(!w->data->keys); |
736 | 748 | ||
737 | /* XXX: tracepoint */ | ||
738 | BUG_ON(!closure_wait(&w->wait, cl)); | 749 | BUG_ON(!closure_wait(&w->wait, cl)); |
739 | 750 | ||
740 | closure_flush(&c->journal.io); | 751 | closure_flush(&c->journal.io); |
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 8589512c972e..1a3b4f4786c3 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c | |||
@@ -9,6 +9,8 @@ | |||
9 | #include "debug.h" | 9 | #include "debug.h" |
10 | #include "request.h" | 10 | #include "request.h" |
11 | 11 | ||
12 | #include <trace/events/bcache.h> | ||
13 | |||
12 | struct moving_io { | 14 | struct moving_io { |
13 | struct keybuf_key *w; | 15 | struct keybuf_key *w; |
14 | struct search s; | 16 | struct search s; |
@@ -44,14 +46,14 @@ static void write_moving_finish(struct closure *cl) | |||
44 | { | 46 | { |
45 | struct moving_io *io = container_of(cl, struct moving_io, s.cl); | 47 | struct moving_io *io = container_of(cl, struct moving_io, s.cl); |
46 | struct bio *bio = &io->bio.bio; | 48 | struct bio *bio = &io->bio.bio; |
47 | struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt); | 49 | struct bio_vec *bv; |
50 | int i; | ||
48 | 51 | ||
49 | while (bv-- != bio->bi_io_vec) | 52 | bio_for_each_segment_all(bv, bio, i) |
50 | __free_page(bv->bv_page); | 53 | __free_page(bv->bv_page); |
51 | 54 | ||
52 | pr_debug("%s %s", io->s.op.insert_collision | 55 | if (io->s.op.insert_collision) |
53 | ? "collision moving" : "moved", | 56 | trace_bcache_gc_copy_collision(&io->w->key); |
54 | pkey(&io->w->key)); | ||
55 | 57 | ||
56 | bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); | 58 | bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); |
57 | 59 | ||
@@ -94,8 +96,6 @@ static void write_moving(struct closure *cl) | |||
94 | struct moving_io *io = container_of(s, struct moving_io, s); | 96 | struct moving_io *io = container_of(s, struct moving_io, s); |
95 | 97 | ||
96 | if (!s->error) { | 98 | if (!s->error) { |
97 | trace_bcache_write_moving(&io->bio.bio); | ||
98 | |||
99 | moving_init(io); | 99 | moving_init(io); |
100 | 100 | ||
101 | io->bio.bio.bi_sector = KEY_START(&io->w->key); | 101 | io->bio.bio.bi_sector = KEY_START(&io->w->key); |
@@ -122,7 +122,6 @@ static void read_moving_submit(struct closure *cl) | |||
122 | struct moving_io *io = container_of(s, struct moving_io, s); | 122 | struct moving_io *io = container_of(s, struct moving_io, s); |
123 | struct bio *bio = &io->bio.bio; | 123 | struct bio *bio = &io->bio.bio; |
124 | 124 | ||
125 | trace_bcache_read_moving(bio); | ||
126 | bch_submit_bbio(bio, s->op.c, &io->w->key, 0); | 125 | bch_submit_bbio(bio, s->op.c, &io->w->key, 0); |
127 | 126 | ||
128 | continue_at(cl, write_moving, bch_gc_wq); | 127 | continue_at(cl, write_moving, bch_gc_wq); |
@@ -138,7 +137,8 @@ static void read_moving(struct closure *cl) | |||
138 | /* XXX: if we error, background writeback could stall indefinitely */ | 137 | /* XXX: if we error, background writeback could stall indefinitely */ |
139 | 138 | ||
140 | while (!test_bit(CACHE_SET_STOPPING, &c->flags)) { | 139 | while (!test_bit(CACHE_SET_STOPPING, &c->flags)) { |
141 | w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY); | 140 | w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, |
141 | &MAX_KEY, moving_pred); | ||
142 | if (!w) | 142 | if (!w) |
143 | break; | 143 | break; |
144 | 144 | ||
@@ -159,10 +159,10 @@ static void read_moving(struct closure *cl) | |||
159 | bio->bi_rw = READ; | 159 | bio->bi_rw = READ; |
160 | bio->bi_end_io = read_moving_endio; | 160 | bio->bi_end_io = read_moving_endio; |
161 | 161 | ||
162 | if (bch_bio_alloc_pages(bio, GFP_KERNEL)) | 162 | if (bio_alloc_pages(bio, GFP_KERNEL)) |
163 | goto err; | 163 | goto err; |
164 | 164 | ||
165 | pr_debug("%s", pkey(&w->key)); | 165 | trace_bcache_gc_copy(&w->key); |
166 | 166 | ||
167 | closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); | 167 | closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); |
168 | 168 | ||
@@ -250,5 +250,5 @@ void bch_moving_gc(struct closure *cl) | |||
250 | 250 | ||
251 | void bch_moving_init_cache_set(struct cache_set *c) | 251 | void bch_moving_init_cache_set(struct cache_set *c) |
252 | { | 252 | { |
253 | bch_keybuf_init(&c->moving_gc_keys, moving_pred); | 253 | bch_keybuf_init(&c->moving_gc_keys); |
254 | } | 254 | } |
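Both call sites above show the same keybuf API change: the filter predicate (moving_pred here, dump_pred in debug.c) is no longer stored in the keybuf at bch_keybuf_init() time but passed to bch_keybuf_next_rescan() on each scan. A small sketch of scanning with a caller-supplied predicate; the types and names are illustrative only:

#include <stdbool.h>
#include <stdio.h>

typedef bool (*key_pred)(unsigned key);

static bool is_even(unsigned key)
{
	return (key & 1) == 0;
}

/* Refill an output buffer with the keys the predicate accepts, the way a
 * keybuf rescan collects candidate keys for moving GC or the debug dump. */
static unsigned refill(const unsigned *keys, unsigned n, key_pred pred,
		       unsigned *out, unsigned out_max)
{
	unsigned i, found = 0;

	for (i = 0; i < n && found < out_max; i++)
		if (pred(keys[i]))
			out[found++] = keys[i];
	return found;
}

int main(void)
{
	unsigned keys[] = { 1, 2, 3, 4, 5, 6 };
	unsigned out[4];
	unsigned i, n = refill(keys, 6, is_even, out, 4);

	for (i = 0; i < n; i++)
		printf("matched key %u\n", out[i]);
	return 0;
}

Passing the predicate per rescan lets one keybuf be filtered differently by different scans instead of being tied to a single predicate for its lifetime.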
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index e5ff12e52d5b..786a1a4f74d8 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include "btree.h" | 10 | #include "btree.h" |
11 | #include "debug.h" | 11 | #include "debug.h" |
12 | #include "request.h" | 12 | #include "request.h" |
13 | #include "writeback.h" | ||
13 | 14 | ||
14 | #include <linux/cgroup.h> | 15 | #include <linux/cgroup.h> |
15 | #include <linux/module.h> | 16 | #include <linux/module.h> |
@@ -21,8 +22,6 @@ | |||
21 | 22 | ||
22 | #define CUTOFF_CACHE_ADD 95 | 23 | #define CUTOFF_CACHE_ADD 95 |
23 | #define CUTOFF_CACHE_READA 90 | 24 | #define CUTOFF_CACHE_READA 90 |
24 | #define CUTOFF_WRITEBACK 50 | ||
25 | #define CUTOFF_WRITEBACK_SYNC 75 | ||
26 | 25 | ||
27 | struct kmem_cache *bch_search_cache; | 26 | struct kmem_cache *bch_search_cache; |
28 | 27 | ||
@@ -489,6 +488,12 @@ static void bch_insert_data_loop(struct closure *cl) | |||
489 | bch_queue_gc(op->c); | 488 | bch_queue_gc(op->c); |
490 | } | 489 | } |
491 | 490 | ||
491 | /* | ||
492 | * Journal writes are marked REQ_FLUSH; if the original write was a | ||
493 | * flush, it'll wait on the journal write. | ||
494 | */ | ||
495 | bio->bi_rw &= ~(REQ_FLUSH|REQ_FUA); | ||
496 | |||
492 | do { | 497 | do { |
493 | unsigned i; | 498 | unsigned i; |
494 | struct bkey *k; | 499 | struct bkey *k; |
@@ -510,10 +515,6 @@ static void bch_insert_data_loop(struct closure *cl) | |||
510 | goto err; | 515 | goto err; |
511 | 516 | ||
512 | n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); | 517 | n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); |
513 | if (!n) { | ||
514 | __bkey_put(op->c, k); | ||
515 | continue_at(cl, bch_insert_data_loop, bcache_wq); | ||
516 | } | ||
517 | 518 | ||
518 | n->bi_end_io = bch_insert_data_endio; | 519 | n->bi_end_io = bch_insert_data_endio; |
519 | n->bi_private = cl; | 520 | n->bi_private = cl; |
@@ -530,10 +531,9 @@ static void bch_insert_data_loop(struct closure *cl) | |||
530 | if (KEY_CSUM(k)) | 531 | if (KEY_CSUM(k)) |
531 | bio_csum(n, k); | 532 | bio_csum(n, k); |
532 | 533 | ||
533 | pr_debug("%s", pkey(k)); | 534 | trace_bcache_cache_insert(k); |
534 | bch_keylist_push(&op->keys); | 535 | bch_keylist_push(&op->keys); |
535 | 536 | ||
536 | trace_bcache_cache_insert(n, n->bi_sector, n->bi_bdev); | ||
537 | n->bi_rw |= REQ_WRITE; | 537 | n->bi_rw |= REQ_WRITE; |
538 | bch_submit_bbio(n, op->c, k, 0); | 538 | bch_submit_bbio(n, op->c, k, 0); |
539 | } while (n != bio); | 539 | } while (n != bio); |
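The new comment above carries the ordering argument: journal writes are issued with REQ_FLUSH (and, per the journal.c hunk earlier, now REQ_FUA as well), so a flush request is satisfied once its keys are journalled and the FLUSH/FUA bits can be stripped from the data bios. A sketch of that flag hand-off; the RQ_* values below are illustrative constants, not the kernel's REQ_* encoding:

#include <stdbool.h>
#include <stdio.h>

#define RQ_WRITE	(1u << 0)
#define RQ_FLUSH	(1u << 1)
#define RQ_FUA		(1u << 2)

int main(void)
{
	unsigned orig_rw = RQ_WRITE | RQ_FLUSH;

	/* Remember that the caller asked for durability, strip FLUSH/FUA
	 * from the data write, and let the journal write carry them. */
	bool flush_journal = orig_rw & (RQ_FLUSH | RQ_FUA);
	unsigned data_rw = orig_rw & ~(RQ_FLUSH | RQ_FUA);
	unsigned journal_rw = RQ_WRITE | RQ_FLUSH | RQ_FUA;

	printf("flush_journal=%d data_rw=%#x journal_rw=%#x\n",
	       flush_journal, data_rw, journal_rw);
	return 0;
}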
@@ -716,7 +716,7 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d) | |||
716 | s->task = current; | 716 | s->task = current; |
717 | s->orig_bio = bio; | 717 | s->orig_bio = bio; |
718 | s->write = (bio->bi_rw & REQ_WRITE) != 0; | 718 | s->write = (bio->bi_rw & REQ_WRITE) != 0; |
719 | s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0; | 719 | s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; |
720 | s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; | 720 | s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; |
721 | s->recoverable = 1; | 721 | s->recoverable = 1; |
722 | s->start_time = jiffies; | 722 | s->start_time = jiffies; |
@@ -784,11 +784,8 @@ static void request_read_error(struct closure *cl) | |||
784 | int i; | 784 | int i; |
785 | 785 | ||
786 | if (s->recoverable) { | 786 | if (s->recoverable) { |
787 | /* The cache read failed, but we can retry from the backing | 787 | /* Retry from the backing device: */ |
788 | * device. | 788 | trace_bcache_read_retry(s->orig_bio); |
789 | */ | ||
790 | pr_debug("recovering at sector %llu", | ||
791 | (uint64_t) s->orig_bio->bi_sector); | ||
792 | 789 | ||
793 | s->error = 0; | 790 | s->error = 0; |
794 | bv = s->bio.bio.bi_io_vec; | 791 | bv = s->bio.bio.bi_io_vec; |
@@ -806,7 +803,6 @@ static void request_read_error(struct closure *cl) | |||
806 | 803 | ||
807 | /* XXX: invalidate cache */ | 804 | /* XXX: invalidate cache */ |
808 | 805 | ||
809 | trace_bcache_read_retry(&s->bio.bio); | ||
810 | closure_bio_submit(&s->bio.bio, &s->cl, s->d); | 806 | closure_bio_submit(&s->bio.bio, &s->cl, s->d); |
811 | } | 807 | } |
812 | 808 | ||
@@ -827,53 +823,13 @@ static void request_read_done(struct closure *cl) | |||
827 | */ | 823 | */ |
828 | 824 | ||
829 | if (s->op.cache_bio) { | 825 | if (s->op.cache_bio) { |
830 | struct bio_vec *src, *dst; | ||
831 | unsigned src_offset, dst_offset, bytes; | ||
832 | void *dst_ptr; | ||
833 | |||
834 | bio_reset(s->op.cache_bio); | 826 | bio_reset(s->op.cache_bio); |
835 | s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; | 827 | s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; |
836 | s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; | 828 | s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; |
837 | s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; | 829 | s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; |
838 | bch_bio_map(s->op.cache_bio, NULL); | 830 | bch_bio_map(s->op.cache_bio, NULL); |
839 | 831 | ||
840 | src = bio_iovec(s->op.cache_bio); | 832 | bio_copy_data(s->cache_miss, s->op.cache_bio); |
841 | dst = bio_iovec(s->cache_miss); | ||
842 | src_offset = src->bv_offset; | ||
843 | dst_offset = dst->bv_offset; | ||
844 | dst_ptr = kmap(dst->bv_page); | ||
845 | |||
846 | while (1) { | ||
847 | if (dst_offset == dst->bv_offset + dst->bv_len) { | ||
848 | kunmap(dst->bv_page); | ||
849 | dst++; | ||
850 | if (dst == bio_iovec_idx(s->cache_miss, | ||
851 | s->cache_miss->bi_vcnt)) | ||
852 | break; | ||
853 | |||
854 | dst_offset = dst->bv_offset; | ||
855 | dst_ptr = kmap(dst->bv_page); | ||
856 | } | ||
857 | |||
858 | if (src_offset == src->bv_offset + src->bv_len) { | ||
859 | src++; | ||
860 | if (src == bio_iovec_idx(s->op.cache_bio, | ||
861 | s->op.cache_bio->bi_vcnt)) | ||
862 | BUG(); | ||
863 | |||
864 | src_offset = src->bv_offset; | ||
865 | } | ||
866 | |||
867 | bytes = min(dst->bv_offset + dst->bv_len - dst_offset, | ||
868 | src->bv_offset + src->bv_len - src_offset); | ||
869 | |||
870 | memcpy(dst_ptr + dst_offset, | ||
871 | page_address(src->bv_page) + src_offset, | ||
872 | bytes); | ||
873 | |||
874 | src_offset += bytes; | ||
875 | dst_offset += bytes; | ||
876 | } | ||
877 | 833 | ||
878 | bio_put(s->cache_miss); | 834 | bio_put(s->cache_miss); |
879 | s->cache_miss = NULL; | 835 | s->cache_miss = NULL; |
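The long open-coded kmap/memcpy loop that copied the cache bio into the cache-miss bio is replaced by a single bio_copy_data() call. Conceptually it is a copy between two buffers that are segmented differently; a userspace analogue follows (struct seg and copy_segs are inventions for the sketch, and page mapping is ignored):

#include <stdio.h>
#include <string.h>

struct seg { char *ptr; size_t len; };

/* Copy src into dst, walking both segment lists in step even though the
 * segment boundaries do not line up. */
static void copy_segs(struct seg *dst, unsigned ndst,
		      const struct seg *src, unsigned nsrc)
{
	unsigned di = 0, si = 0;
	size_t doff = 0, soff = 0;

	while (di < ndst && si < nsrc) {
		size_t n = dst[di].len - doff;

		if (n > src[si].len - soff)
			n = src[si].len - soff;

		memcpy(dst[di].ptr + doff, src[si].ptr + soff, n);
		doff += n;
		soff += n;
		if (doff == dst[di].len) { di++; doff = 0; }
		if (soff == src[si].len) { si++; soff = 0; }
	}
}

int main(void)
{
	char a[] = "abcd", b[] = "efgh";
	char out[9] = { 0 };
	struct seg src[] = { { a, 4 }, { b, 4 } };
	struct seg dst[] = { { out, 8 } };

	copy_segs(dst, 1, src, 2);
	printf("%s\n", out);	/* abcdefgh */
	return 0;
}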
@@ -899,6 +855,7 @@ static void request_read_done_bh(struct closure *cl) | |||
899 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); | 855 | struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); |
900 | 856 | ||
901 | bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); | 857 | bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); |
858 | trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip); | ||
902 | 859 | ||
903 | if (s->error) | 860 | if (s->error) |
904 | continue_at_nobarrier(cl, request_read_error, bcache_wq); | 861 | continue_at_nobarrier(cl, request_read_error, bcache_wq); |
@@ -917,9 +874,6 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, | |||
917 | struct bio *miss; | 874 | struct bio *miss; |
918 | 875 | ||
919 | miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); | 876 | miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); |
920 | if (!miss) | ||
921 | return -EAGAIN; | ||
922 | |||
923 | if (miss == bio) | 877 | if (miss == bio) |
924 | s->op.lookup_done = true; | 878 | s->op.lookup_done = true; |
925 | 879 | ||
@@ -938,8 +892,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, | |||
938 | reada = min(dc->readahead >> 9, | 892 | reada = min(dc->readahead >> 9, |
939 | sectors - bio_sectors(miss)); | 893 | sectors - bio_sectors(miss)); |
940 | 894 | ||
941 | if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev)) | 895 | if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev)) |
942 | reada = bdev_sectors(miss->bi_bdev) - bio_end(miss); | 896 | reada = bdev_sectors(miss->bi_bdev) - |
897 | bio_end_sector(miss); | ||
943 | } | 898 | } |
944 | 899 | ||
945 | s->cache_bio_sectors = bio_sectors(miss) + reada; | 900 | s->cache_bio_sectors = bio_sectors(miss) + reada; |
@@ -963,13 +918,12 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, | |||
963 | goto out_put; | 918 | goto out_put; |
964 | 919 | ||
965 | bch_bio_map(s->op.cache_bio, NULL); | 920 | bch_bio_map(s->op.cache_bio, NULL); |
966 | if (bch_bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) | 921 | if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) |
967 | goto out_put; | 922 | goto out_put; |
968 | 923 | ||
969 | s->cache_miss = miss; | 924 | s->cache_miss = miss; |
970 | bio_get(s->op.cache_bio); | 925 | bio_get(s->op.cache_bio); |
971 | 926 | ||
972 | trace_bcache_cache_miss(s->orig_bio); | ||
973 | closure_bio_submit(s->op.cache_bio, &s->cl, s->d); | 927 | closure_bio_submit(s->op.cache_bio, &s->cl, s->d); |
974 | 928 | ||
975 | return ret; | 929 | return ret; |
@@ -1002,24 +956,13 @@ static void cached_dev_write_complete(struct closure *cl) | |||
1002 | cached_dev_bio_complete(cl); | 956 | cached_dev_bio_complete(cl); |
1003 | } | 957 | } |
1004 | 958 | ||
1005 | static bool should_writeback(struct cached_dev *dc, struct bio *bio) | ||
1006 | { | ||
1007 | unsigned threshold = (bio->bi_rw & REQ_SYNC) | ||
1008 | ? CUTOFF_WRITEBACK_SYNC | ||
1009 | : CUTOFF_WRITEBACK; | ||
1010 | |||
1011 | return !atomic_read(&dc->disk.detaching) && | ||
1012 | cache_mode(dc, bio) == CACHE_MODE_WRITEBACK && | ||
1013 | dc->disk.c->gc_stats.in_use < threshold; | ||
1014 | } | ||
1015 | |||
1016 | static void request_write(struct cached_dev *dc, struct search *s) | 959 | static void request_write(struct cached_dev *dc, struct search *s) |
1017 | { | 960 | { |
1018 | struct closure *cl = &s->cl; | 961 | struct closure *cl = &s->cl; |
1019 | struct bio *bio = &s->bio.bio; | 962 | struct bio *bio = &s->bio.bio; |
1020 | struct bkey start, end; | 963 | struct bkey start, end; |
1021 | start = KEY(dc->disk.id, bio->bi_sector, 0); | 964 | start = KEY(dc->disk.id, bio->bi_sector, 0); |
1022 | end = KEY(dc->disk.id, bio_end(bio), 0); | 965 | end = KEY(dc->disk.id, bio_end_sector(bio), 0); |
1023 | 966 | ||
1024 | bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); | 967 | bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); |
1025 | 968 | ||
@@ -1034,22 +977,37 @@ static void request_write(struct cached_dev *dc, struct search *s) | |||
1034 | if (bio->bi_rw & REQ_DISCARD) | 977 | if (bio->bi_rw & REQ_DISCARD) |
1035 | goto skip; | 978 | goto skip; |
1036 | 979 | ||
980 | if (should_writeback(dc, s->orig_bio, | ||
981 | cache_mode(dc, bio), | ||
982 | s->op.skip)) { | ||
983 | s->op.skip = false; | ||
984 | s->writeback = true; | ||
985 | } | ||
986 | |||
1037 | if (s->op.skip) | 987 | if (s->op.skip) |
1038 | goto skip; | 988 | goto skip; |
1039 | 989 | ||
1040 | if (should_writeback(dc, s->orig_bio)) | 990 | trace_bcache_write(s->orig_bio, s->writeback, s->op.skip); |
1041 | s->writeback = true; | ||
1042 | 991 | ||
1043 | if (!s->writeback) { | 992 | if (!s->writeback) { |
1044 | s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, | 993 | s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, |
1045 | dc->disk.bio_split); | 994 | dc->disk.bio_split); |
1046 | 995 | ||
1047 | trace_bcache_writethrough(s->orig_bio); | ||
1048 | closure_bio_submit(bio, cl, s->d); | 996 | closure_bio_submit(bio, cl, s->d); |
1049 | } else { | 997 | } else { |
1050 | s->op.cache_bio = bio; | 998 | bch_writeback_add(dc); |
1051 | trace_bcache_writeback(s->orig_bio); | 999 | |
1052 | bch_writeback_add(dc, bio_sectors(bio)); | 1000 | if (s->op.flush_journal) { |
1001 | /* Also need to send a flush to the backing device */ | ||
1002 | s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, | ||
1003 | dc->disk.bio_split); | ||
1004 | |||
1005 | bio->bi_size = 0; | ||
1006 | bio->bi_vcnt = 0; | ||
1007 | closure_bio_submit(bio, cl, s->d); | ||
1008 | } else { | ||
1009 | s->op.cache_bio = bio; | ||
1010 | } | ||
1053 | } | 1011 | } |
1054 | out: | 1012 | out: |
1055 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); | 1013 | closure_call(&s->op.cl, bch_insert_data, NULL, cl); |
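Two things change in request_write(): the writeback decision moves out of request.c (the CUTOFF_WRITEBACK constants disappear above and should_writeback() is now called with the cache mode and the current skip decision, presumably defined in writeback.h, which request.c now includes), and a writeback flush additionally clones the bio and sends a zero-size flush to the backing device so FLUSH ordering also reaches data that is still dirty there. A sketch of the old-style cutoff check, using the thresholds that were removed above; the real helper's signature and home are not shown in this hunk:

#include <stdbool.h>
#include <stdio.h>

#define CUTOFF_WRITEBACK	50	/* % of cache in use */
#define CUTOFF_WRITEBACK_SYNC	75

enum cache_mode { MODE_WRITETHROUGH, MODE_WRITEBACK };

/* Write back (instead of through) only when the device is in writeback
 * mode, is not detaching, and the cache is not too full; synchronous
 * writes tolerate a fuller cache before falling back to writethrough. */
static bool should_writeback(enum cache_mode mode, bool detaching,
			     bool sync, unsigned in_use_pct)
{
	unsigned threshold = sync ? CUTOFF_WRITEBACK_SYNC : CUTOFF_WRITEBACK;

	return !detaching &&
	       mode == MODE_WRITEBACK &&
	       in_use_pct < threshold;
}

int main(void)
{
	printf("%d\n", should_writeback(MODE_WRITEBACK, false, false, 40)); /* 1 */
	printf("%d\n", should_writeback(MODE_WRITEBACK, false, false, 60)); /* 0 */
	printf("%d\n", should_writeback(MODE_WRITEBACK, false, true, 60));  /* 1 */
	return 0;
}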
@@ -1058,7 +1016,6 @@ skip: | |||
1058 | s->op.skip = true; | 1016 | s->op.skip = true; |
1059 | s->op.cache_bio = s->orig_bio; | 1017 | s->op.cache_bio = s->orig_bio; |
1060 | bio_get(s->op.cache_bio); | 1018 | bio_get(s->op.cache_bio); |
1061 | trace_bcache_write_skip(s->orig_bio); | ||
1062 | 1019 | ||
1063 | if ((bio->bi_rw & REQ_DISCARD) && | 1020 | if ((bio->bi_rw & REQ_DISCARD) && |
1064 | !blk_queue_discard(bdev_get_queue(dc->bdev))) | 1021 | !blk_queue_discard(bdev_get_queue(dc->bdev))) |
@@ -1088,9 +1045,10 @@ static void request_nodata(struct cached_dev *dc, struct search *s) | |||
1088 | 1045 | ||
1089 | /* Cached devices - read & write stuff */ | 1046 | /* Cached devices - read & write stuff */ |
1090 | 1047 | ||
1091 | int bch_get_congested(struct cache_set *c) | 1048 | unsigned bch_get_congested(struct cache_set *c) |
1092 | { | 1049 | { |
1093 | int i; | 1050 | int i; |
1051 | long rand; | ||
1094 | 1052 | ||
1095 | if (!c->congested_read_threshold_us && | 1053 | if (!c->congested_read_threshold_us && |
1096 | !c->congested_write_threshold_us) | 1054 | !c->congested_write_threshold_us) |
@@ -1106,7 +1064,13 @@ int bch_get_congested(struct cache_set *c) | |||
1106 | 1064 | ||
1107 | i += CONGESTED_MAX; | 1065 | i += CONGESTED_MAX; |
1108 | 1066 | ||
1109 | return i <= 0 ? 1 : fract_exp_two(i, 6); | 1067 | if (i > 0) |
1068 | i = fract_exp_two(i, 6); | ||
1069 | |||
1070 | rand = get_random_int(); | ||
1071 | i -= bitmap_weight(&rand, BITS_PER_LONG); | ||
1072 | |||
1073 | return i > 0 ? i : 1; | ||
1110 | } | 1074 | } |
1111 | 1075 | ||
1112 | static void add_sequential(struct task_struct *t) | 1076 | static void add_sequential(struct task_struct *t) |
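bch_get_congested() now returns an unsigned threshold and does the dithering itself instead of leaving it to the caller: the fixed-point fract_exp_two() value is reduced by the population count of a random long (roughly half the word size on average), so the cutoff jitters from call to call rather than acting as a hard edge. A sketch of just the dithering step; fract_exp_two() is not reproduced and __builtin_popcountl() is a GCC/Clang builtin standing in for bitmap_weight():

#include <stdio.h>
#include <stdlib.h>

static unsigned dithered_threshold(long i)
{
	long r = random();

	/* Subtract ~BITS_PER_LONG/2 on average, with call-to-call noise. */
	i -= __builtin_popcountl(r);
	return i > 0 ? (unsigned)i : 1;
}

int main(void)
{
	int n;

	srandom(42);
	for (n = 0; n < 4; n++)
		printf("threshold ~100 -> %u\n", dithered_threshold(100));
	return 0;
}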
@@ -1126,10 +1090,8 @@ static void check_should_skip(struct cached_dev *dc, struct search *s) | |||
1126 | { | 1090 | { |
1127 | struct cache_set *c = s->op.c; | 1091 | struct cache_set *c = s->op.c; |
1128 | struct bio *bio = &s->bio.bio; | 1092 | struct bio *bio = &s->bio.bio; |
1129 | |||
1130 | long rand; | ||
1131 | int cutoff = bch_get_congested(c); | ||
1132 | unsigned mode = cache_mode(dc, bio); | 1093 | unsigned mode = cache_mode(dc, bio); |
1094 | unsigned sectors, congested = bch_get_congested(c); | ||
1133 | 1095 | ||
1134 | if (atomic_read(&dc->disk.detaching) || | 1096 | if (atomic_read(&dc->disk.detaching) || |
1135 | c->gc_stats.in_use > CUTOFF_CACHE_ADD || | 1097 | c->gc_stats.in_use > CUTOFF_CACHE_ADD || |
@@ -1147,17 +1109,14 @@ static void check_should_skip(struct cached_dev *dc, struct search *s) | |||
1147 | goto skip; | 1109 | goto skip; |
1148 | } | 1110 | } |
1149 | 1111 | ||
1150 | if (!cutoff) { | 1112 | if (!congested && !dc->sequential_cutoff) |
1151 | cutoff = dc->sequential_cutoff >> 9; | 1113 | goto rescale; |
1152 | 1114 | ||
1153 | if (!cutoff) | 1115 | if (!congested && |
1154 | goto rescale; | 1116 | mode == CACHE_MODE_WRITEBACK && |
1155 | 1117 | (bio->bi_rw & REQ_WRITE) && | |
1156 | if (mode == CACHE_MODE_WRITEBACK && | 1118 | (bio->bi_rw & REQ_SYNC)) |
1157 | (bio->bi_rw & REQ_WRITE) && | 1119 | goto rescale; |
1158 | (bio->bi_rw & REQ_SYNC)) | ||
1159 | goto rescale; | ||
1160 | } | ||
1161 | 1120 | ||
1162 | if (dc->sequential_merge) { | 1121 | if (dc->sequential_merge) { |
1163 | struct io *i; | 1122 | struct io *i; |
@@ -1177,7 +1136,7 @@ found: | |||
1177 | if (i->sequential + bio->bi_size > i->sequential) | 1136 | if (i->sequential + bio->bi_size > i->sequential) |
1178 | i->sequential += bio->bi_size; | 1137 | i->sequential += bio->bi_size; |
1179 | 1138 | ||
1180 | i->last = bio_end(bio); | 1139 | i->last = bio_end_sector(bio); |
1181 | i->jiffies = jiffies + msecs_to_jiffies(5000); | 1140 | i->jiffies = jiffies + msecs_to_jiffies(5000); |
1182 | s->task->sequential_io = i->sequential; | 1141 | s->task->sequential_io = i->sequential; |
1183 | 1142 | ||
@@ -1192,12 +1151,19 @@ found: | |||
1192 | add_sequential(s->task); | 1151 | add_sequential(s->task); |
1193 | } | 1152 | } |
1194 | 1153 | ||
1195 | rand = get_random_int(); | 1154 | sectors = max(s->task->sequential_io, |
1196 | cutoff -= bitmap_weight(&rand, BITS_PER_LONG); | 1155 | s->task->sequential_io_avg) >> 9; |
1197 | 1156 | ||
1198 | if (cutoff <= (int) (max(s->task->sequential_io, | 1157 | if (dc->sequential_cutoff && |
1199 | s->task->sequential_io_avg) >> 9)) | 1158 | sectors >= dc->sequential_cutoff >> 9) { |
1159 | trace_bcache_bypass_sequential(s->orig_bio); | ||
1200 | goto skip; | 1160 | goto skip; |
1161 | } | ||
1162 | |||
1163 | if (congested && sectors >= congested) { | ||
1164 | trace_bcache_bypass_congested(s->orig_bio); | ||
1165 | goto skip; | ||
1166 | } | ||
1201 | 1167 | ||
1202 | rescale: | 1168 | rescale: |
1203 | bch_rescale_priorities(c, bio_sectors(bio)); | 1169 | bch_rescale_priorities(c, bio_sectors(bio)); |
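check_should_skip() now works in sectors: it takes the larger of the task's current and average sequential I/O, shifts it down to sectors, and bypasses the cache when that exceeds either the configured sequential cutoff or the congestion value, each path getting its own tracepoint. A sketch of that decision; should_bypass is an invented name, since the kernel does this inline:

#include <stdbool.h>
#include <stdio.h>

static bool should_bypass(unsigned long long sequential_io,
			  unsigned long long sequential_io_avg,
			  unsigned long long sequential_cutoff_bytes,
			  unsigned congested)
{
	unsigned long long sectors = (sequential_io > sequential_io_avg ?
				      sequential_io : sequential_io_avg) >> 9;

	if (sequential_cutoff_bytes &&
	    sectors >= sequential_cutoff_bytes >> 9)
		return true;		/* bypass_sequential */

	if (congested && sectors >= congested)
		return true;		/* bypass_congested */

	return false;
}

int main(void)
{
	/* 8 MiB of sequential I/O against a 4 MiB cutoff: bypass. */
	printf("%d\n", should_bypass(8ULL << 20, 0, 4ULL << 20, 0));
	/* A short, non-sequential request: let the cache handle it. */
	printf("%d\n", should_bypass(16ULL << 9, 0, 4ULL << 20, 0));
	return 0;
}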
@@ -1288,30 +1254,25 @@ void bch_cached_dev_request_init(struct cached_dev *dc) | |||
1288 | static int flash_dev_cache_miss(struct btree *b, struct search *s, | 1254 | static int flash_dev_cache_miss(struct btree *b, struct search *s, |
1289 | struct bio *bio, unsigned sectors) | 1255 | struct bio *bio, unsigned sectors) |
1290 | { | 1256 | { |
1257 | struct bio_vec *bv; | ||
1258 | int i; | ||
1259 | |||
1291 | /* Zero fill bio */ | 1260 | /* Zero fill bio */ |
1292 | 1261 | ||
1293 | while (bio->bi_idx != bio->bi_vcnt) { | 1262 | bio_for_each_segment(bv, bio, i) { |
1294 | struct bio_vec *bv = bio_iovec(bio); | ||
1295 | unsigned j = min(bv->bv_len >> 9, sectors); | 1263 | unsigned j = min(bv->bv_len >> 9, sectors); |
1296 | 1264 | ||
1297 | void *p = kmap(bv->bv_page); | 1265 | void *p = kmap(bv->bv_page); |
1298 | memset(p + bv->bv_offset, 0, j << 9); | 1266 | memset(p + bv->bv_offset, 0, j << 9); |
1299 | kunmap(bv->bv_page); | 1267 | kunmap(bv->bv_page); |
1300 | 1268 | ||
1301 | bv->bv_len -= j << 9; | 1269 | sectors -= j; |
1302 | bv->bv_offset += j << 9; | ||
1303 | |||
1304 | if (bv->bv_len) | ||
1305 | return 0; | ||
1306 | |||
1307 | bio->bi_sector += j; | ||
1308 | bio->bi_size -= j << 9; | ||
1309 | |||
1310 | bio->bi_idx++; | ||
1311 | sectors -= j; | ||
1312 | } | 1270 | } |
1313 | 1271 | ||
1314 | s->op.lookup_done = true; | 1272 | bio_advance(bio, min(sectors << 9, bio->bi_size)); |
1273 | |||
1274 | if (!bio->bi_size) | ||
1275 | s->op.lookup_done = true; | ||
1315 | 1276 | ||
1316 | return 0; | 1277 | return 0; |
1317 | } | 1278 | } |
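flash_dev_cache_miss() no longer adjusts bi_sector/bi_size/bi_idx by hand; it zero-fills whole segments with bio_for_each_segment() and then lets bio_advance() account for the consumed bytes, marking the lookup done only when nothing is left. A userspace sketch of the zero-fill, with struct seg again standing in for bio_vec and 512-byte sectors assumed:

#include <stdio.h>
#include <string.h>

struct seg { unsigned char *ptr; unsigned len; };

/* Zero at most 'sectors' worth of data across the segment list and return
 * how many bytes were handled (the accounting bio_advance() does in the
 * kernel). */
static unsigned zero_fill(struct seg *segs, unsigned nsegs, unsigned sectors)
{
	unsigned i, done = 0;

	for (i = 0; i < nsegs; i++) {
		unsigned j = segs[i].len >> 9;

		if (j > sectors)
			j = sectors;
		memset(segs[i].ptr, 0, j << 9);
		sectors -= j;
		done += j << 9;
	}
	return done;
}

int main(void)
{
	unsigned char a[1024], b[1024];
	struct seg segs[] = { { a, sizeof(a) }, { b, sizeof(b) } };

	/* Only 3 of the 4 sectors in the segment list can be zero-filled. */
	printf("zeroed %u bytes\n", zero_fill(segs, 2, 3));
	return 0;
}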
@@ -1338,8 +1299,8 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) | |||
1338 | closure_call(&s->op.cl, btree_read_async, NULL, cl); | 1299 | closure_call(&s->op.cl, btree_read_async, NULL, cl); |
1339 | } else if (bio_has_data(bio) || s->op.skip) { | 1300 | } else if (bio_has_data(bio) || s->op.skip) { |
1340 | bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, | 1301 | bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, |
1341 | &KEY(d->id, bio->bi_sector, 0), | 1302 | &KEY(d->id, bio->bi_sector, 0), |
1342 | &KEY(d->id, bio_end(bio), 0)); | 1303 | &KEY(d->id, bio_end_sector(bio), 0)); |
1343 | 1304 | ||
1344 | s->writeback = true; | 1305 | s->writeback = true; |
1345 | s->op.cache_bio = bio; | 1306 | s->op.cache_bio = bio; |
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 254d9ab5707c..57dc4784f4f4 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h | |||
@@ -30,7 +30,7 @@ struct search { | |||
30 | }; | 30 | }; |
31 | 31 | ||
32 | void bch_cache_read_endio(struct bio *, int); | 32 | void bch_cache_read_endio(struct bio *, int); |
33 | int bch_get_congested(struct cache_set *); | 33 | unsigned bch_get_congested(struct cache_set *); |
34 | void bch_insert_data(struct closure *cl); | 34 | void bch_insert_data(struct closure *cl); |
35 | void bch_btree_insert_async(struct closure *); | 35 | void bch_btree_insert_async(struct closure *); |
36 | void bch_cache_read_endio(struct bio *, int); | 36 | void bch_cache_read_endio(struct bio *, int); |
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index f88e2b653a3f..547c4c57b052 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c | |||
@@ -10,10 +10,13 @@ | |||
10 | #include "btree.h" | 10 | #include "btree.h" |
11 | #include "debug.h" | 11 | #include "debug.h" |
12 | #include "request.h" | 12 | #include "request.h" |
13 | #include "writeback.h" | ||
13 | 14 | ||
15 | #include <linux/blkdev.h> | ||
14 | #include <linux/buffer_head.h> | 16 | #include <linux/buffer_head.h> |
15 | #include <linux/debugfs.h> | 17 | #include <linux/debugfs.h> |
16 | #include <linux/genhd.h> | 18 | #include <linux/genhd.h> |
19 | #include <linux/kthread.h> | ||
17 | #include <linux/module.h> | 20 | #include <linux/module.h> |
18 | #include <linux/random.h> | 21 | #include <linux/random.h> |
19 | #include <linux/reboot.h> | 22 | #include <linux/reboot.h> |
@@ -342,6 +345,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw, | |||
342 | struct closure *cl = &c->uuid_write.cl; | 345 | struct closure *cl = &c->uuid_write.cl; |
343 | struct uuid_entry *u; | 346 | struct uuid_entry *u; |
344 | unsigned i; | 347 | unsigned i; |
348 | char buf[80]; | ||
345 | 349 | ||
346 | BUG_ON(!parent); | 350 | BUG_ON(!parent); |
347 | closure_lock(&c->uuid_write, parent); | 351 | closure_lock(&c->uuid_write, parent); |
@@ -362,8 +366,8 @@ static void uuid_io(struct cache_set *c, unsigned long rw, | |||
362 | break; | 366 | break; |
363 | } | 367 | } |
364 | 368 | ||
365 | pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", | 369 | bch_bkey_to_text(buf, sizeof(buf), k); |
366 | pkey(&c->uuid_bucket)); | 370 | pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf); |
367 | 371 | ||
368 | for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) | 372 | for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) |
369 | if (!bch_is_zero(u->uuid, 16)) | 373 | if (!bch_is_zero(u->uuid, 16)) |
@@ -543,7 +547,6 @@ void bch_prio_write(struct cache *ca) | |||
543 | 547 | ||
544 | pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), | 548 | pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), |
545 | fifo_used(&ca->free_inc), fifo_used(&ca->unused)); | 549 | fifo_used(&ca->free_inc), fifo_used(&ca->unused)); |
546 | blktrace_msg(ca, "Starting priorities: " buckets_free(ca)); | ||
547 | 550 | ||
548 | for (i = prio_buckets(ca) - 1; i >= 0; --i) { | 551 | for (i = prio_buckets(ca) - 1; i >= 0; --i) { |
549 | long bucket; | 552 | long bucket; |
@@ -704,7 +707,8 @@ static void bcache_device_detach(struct bcache_device *d) | |||
704 | atomic_set(&d->detaching, 0); | 707 | atomic_set(&d->detaching, 0); |
705 | } | 708 | } |
706 | 709 | ||
707 | bcache_device_unlink(d); | 710 | if (!d->flush_done) |
711 | bcache_device_unlink(d); | ||
708 | 712 | ||
709 | d->c->devices[d->id] = NULL; | 713 | d->c->devices[d->id] = NULL; |
710 | closure_put(&d->c->caching); | 714 | closure_put(&d->c->caching); |
@@ -743,13 +747,35 @@ static void bcache_device_free(struct bcache_device *d) | |||
743 | mempool_destroy(d->unaligned_bvec); | 747 | mempool_destroy(d->unaligned_bvec); |
744 | if (d->bio_split) | 748 | if (d->bio_split) |
745 | bioset_free(d->bio_split); | 749 | bioset_free(d->bio_split); |
750 | if (is_vmalloc_addr(d->stripe_sectors_dirty)) | ||
751 | vfree(d->stripe_sectors_dirty); | ||
752 | else | ||
753 | kfree(d->stripe_sectors_dirty); | ||
746 | 754 | ||
747 | closure_debug_destroy(&d->cl); | 755 | closure_debug_destroy(&d->cl); |
748 | } | 756 | } |
749 | 757 | ||
750 | static int bcache_device_init(struct bcache_device *d, unsigned block_size) | 758 | static int bcache_device_init(struct bcache_device *d, unsigned block_size, |
759 | sector_t sectors) | ||
751 | { | 760 | { |
752 | struct request_queue *q; | 761 | struct request_queue *q; |
762 | size_t n; | ||
763 | |||
764 | if (!d->stripe_size_bits) | ||
765 | d->stripe_size_bits = 31; | ||
766 | |||
767 | d->nr_stripes = round_up(sectors, 1 << d->stripe_size_bits) >> | ||
768 | d->stripe_size_bits; | ||
769 | |||
770 | if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) | ||
771 | return -ENOMEM; | ||
772 | |||
773 | n = d->nr_stripes * sizeof(atomic_t); | ||
774 | d->stripe_sectors_dirty = n < PAGE_SIZE << 6 | ||
775 | ? kzalloc(n, GFP_KERNEL) | ||
776 | : vzalloc(n); | ||
777 | if (!d->stripe_sectors_dirty) | ||
778 | return -ENOMEM; | ||
753 | 779 | ||
754 | if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || | 780 | if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || |
755 | !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, | 781 | !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, |
@@ -759,6 +785,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size) | |||
759 | !(q = blk_alloc_queue(GFP_KERNEL))) | 785 | !(q = blk_alloc_queue(GFP_KERNEL))) |
760 | return -ENOMEM; | 786 | return -ENOMEM; |
761 | 787 | ||
788 | set_capacity(d->disk, sectors); | ||
762 | snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); | 789 | snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); |
763 | 790 | ||
764 | d->disk->major = bcache_major; | 791 | d->disk->major = bcache_major; |
@@ -781,6 +808,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size) | |||
781 | set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags); | 808 | set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags); |
782 | set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags); | 809 | set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags); |
783 | 810 | ||
811 | blk_queue_flush(q, REQ_FLUSH|REQ_FUA); | ||
812 | |||
784 | return 0; | 813 | return 0; |
785 | } | 814 | } |
786 | 815 | ||
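bcache_device_init() now takes the device size up front so it can size the per-stripe dirty-sector counters (stripe_size_bits defaults to 31), reject overflow against SIZE_MAX, pick kzalloc() or vzalloc() depending on how large the array gets, and set the capacity and the REQ_FLUSH|REQ_FUA queue flags itself. A sketch of just the sizing arithmetic and overflow guard, with plain calloc() standing in for the kzalloc/vzalloc choice and int counters standing in for atomic_t:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int *alloc_stripe_counters(uint64_t sectors, unsigned stripe_size_bits,
				  size_t *nr_out)
{
	uint64_t stripe_sectors = 1ULL << stripe_size_bits;
	/* round_up(sectors, stripe_sectors) >> stripe_size_bits */
	uint64_t nr = (sectors + stripe_sectors - 1) >> stripe_size_bits;

	if (!nr || nr > SIZE_MAX / sizeof(int))
		return NULL;	/* zero-sized device or counter-array overflow */

	*nr_out = (size_t)nr;
	return calloc((size_t)nr, sizeof(int));
}

int main(void)
{
	size_t nr = 0;
	/* 2^31 sectors (1 TiB) with 2^31-sector stripes -> one counter. */
	int *counters = alloc_stripe_counters(1ULL << 31, 31, &nr);

	printf("%zu stripe counters, %s\n", nr, counters ? "ok" : "failed");
	free(counters);
	return 0;
}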
@@ -800,6 +829,17 @@ static void calc_cached_dev_sectors(struct cache_set *c) | |||
800 | void bch_cached_dev_run(struct cached_dev *dc) | 829 | void bch_cached_dev_run(struct cached_dev *dc) |
801 | { | 830 | { |
802 | struct bcache_device *d = &dc->disk; | 831 | struct bcache_device *d = &dc->disk; |
832 | char buf[SB_LABEL_SIZE + 1]; | ||
833 | char *env[] = { | ||
834 | "DRIVER=bcache", | ||
835 | kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid), | ||
836 | NULL, | ||
837 | NULL, | ||
838 | }; | ||
839 | |||
840 | memcpy(buf, dc->sb.label, SB_LABEL_SIZE); | ||
841 | buf[SB_LABEL_SIZE] = '\0'; | ||
842 | env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf); | ||
803 | 843 | ||
804 | if (atomic_xchg(&dc->running, 1)) | 844 | if (atomic_xchg(&dc->running, 1)) |
805 | return; | 845 | return; |
@@ -816,10 +856,12 @@ void bch_cached_dev_run(struct cached_dev *dc) | |||
816 | 856 | ||
817 | add_disk(d->disk); | 857 | add_disk(d->disk); |
818 | bd_link_disk_holder(dc->bdev, dc->disk.disk); | 858 | bd_link_disk_holder(dc->bdev, dc->disk.disk); |
819 | #if 0 | 859 | /* won't show up in the uevent file, use udevadm monitor -e instead |
820 | char *env[] = { "SYMLINK=label" , NULL }; | 860 | * only class / kset properties are persistent */ |
821 | kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); | 861 | kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); |
822 | #endif | 862 | kfree(env[1]); |
863 | kfree(env[2]); | ||
864 | |||
823 | if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || | 865 | if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || |
824 | sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) | 866 | sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) |
825 | pr_debug("error creating sysfs link"); | 867 | pr_debug("error creating sysfs link"); |
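The old #if 0 block becomes real: bch_cached_dev_run() builds a NULL-terminated environment carrying the cached device's UUID and label and emits a KOBJ_CHANGE uevent, copying the label first because the superblock field is not guaranteed to be NUL-terminated. A userspace sketch of assembling that environment; asprintf() (GNU) stands in for kasprintf(), and the UUID string is made up:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SB_LABEL_SIZE 32

int main(void)
{
	char sb_label[SB_LABEL_SIZE] = "my-cached-disk";
	char buf[SB_LABEL_SIZE + 1];
	char *env[4] = { "DRIVER=bcache", NULL, NULL, NULL };
	int i;

	/* The on-disk label may use all SB_LABEL_SIZE bytes, so terminate
	 * a copy rather than trusting it to be a C string. */
	memcpy(buf, sb_label, SB_LABEL_SIZE);
	buf[SB_LABEL_SIZE] = '\0';

	if (asprintf(&env[1], "CACHED_UUID=%s", "0123-4567-89ab-cdef") < 0 ||
	    asprintf(&env[2], "CACHED_LABEL=%s", buf) < 0)
		return 1;

	for (i = 0; env[i]; i++)
		printf("%s\n", env[i]);	/* what kobject_uevent_env() would see */

	free(env[1]);
	free(env[2]);
	return 0;
}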
@@ -960,6 +1002,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) | |||
960 | atomic_set(&dc->count, 1); | 1002 | atomic_set(&dc->count, 1); |
961 | 1003 | ||
962 | if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { | 1004 | if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { |
1005 | bch_sectors_dirty_init(dc); | ||
963 | atomic_set(&dc->has_dirty, 1); | 1006 | atomic_set(&dc->has_dirty, 1); |
964 | atomic_inc(&dc->count); | 1007 | atomic_inc(&dc->count); |
965 | bch_writeback_queue(dc); | 1008 | bch_writeback_queue(dc); |
@@ -1014,6 +1057,14 @@ static void cached_dev_flush(struct closure *cl) | |||
1014 | struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); | 1057 | struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); |
1015 | struct bcache_device *d = &dc->disk; | 1058 | struct bcache_device *d = &dc->disk; |
1016 | 1059 | ||
1060 | mutex_lock(&bch_register_lock); | ||
1061 | d->flush_done = 1; | ||
1062 | |||
1063 | if (d->c) | ||
1064 | bcache_device_unlink(d); | ||
1065 | |||
1066 | mutex_unlock(&bch_register_lock); | ||
1067 | |||
1017 | bch_cache_accounting_destroy(&dc->accounting); | 1068 | bch_cache_accounting_destroy(&dc->accounting); |
1018 | kobject_del(&d->kobj); | 1069 | kobject_del(&d->kobj); |
1019 | 1070 | ||
@@ -1045,7 +1096,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) | |||
1045 | hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); | 1096 | hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); |
1046 | } | 1097 | } |
1047 | 1098 | ||
1048 | ret = bcache_device_init(&dc->disk, block_size); | 1099 | ret = bcache_device_init(&dc->disk, block_size, |
1100 | dc->bdev->bd_part->nr_sects - dc->sb.data_offset); | ||
1049 | if (ret) | 1101 | if (ret) |
1050 | return ret; | 1102 | return ret; |
1051 | 1103 | ||
@@ -1144,11 +1196,10 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) | |||
1144 | 1196 | ||
1145 | kobject_init(&d->kobj, &bch_flash_dev_ktype); | 1197 | kobject_init(&d->kobj, &bch_flash_dev_ktype); |
1146 | 1198 | ||
1147 | if (bcache_device_init(d, block_bytes(c))) | 1199 | if (bcache_device_init(d, block_bytes(c), u->sectors)) |
1148 | goto err; | 1200 | goto err; |
1149 | 1201 | ||
1150 | bcache_device_attach(d, c, u - c->uuids); | 1202 | bcache_device_attach(d, c, u - c->uuids); |
1151 | set_capacity(d->disk, u->sectors); | ||
1152 | bch_flash_dev_request_init(d); | 1203 | bch_flash_dev_request_init(d); |
1153 | add_disk(d->disk); | 1204 | add_disk(d->disk); |
1154 | 1205 | ||
@@ -1255,9 +1306,10 @@ static void cache_set_free(struct closure *cl) | |||
1255 | free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); | 1306 | free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); |
1256 | free_pages((unsigned long) c->sort, ilog2(bucket_pages(c))); | 1307 | free_pages((unsigned long) c->sort, ilog2(bucket_pages(c))); |
1257 | 1308 | ||
1258 | kfree(c->fill_iter); | ||
1259 | if (c->bio_split) | 1309 | if (c->bio_split) |
1260 | bioset_free(c->bio_split); | 1310 | bioset_free(c->bio_split); |
1311 | if (c->fill_iter) | ||
1312 | mempool_destroy(c->fill_iter); | ||
1261 | if (c->bio_meta) | 1313 | if (c->bio_meta) |
1262 | mempool_destroy(c->bio_meta); | 1314 | mempool_destroy(c->bio_meta); |
1263 | if (c->search) | 1315 | if (c->search) |
@@ -1278,11 +1330,9 @@ static void cache_set_free(struct closure *cl) | |||
1278 | static void cache_set_flush(struct closure *cl) | 1330 | static void cache_set_flush(struct closure *cl) |
1279 | { | 1331 | { |
1280 | struct cache_set *c = container_of(cl, struct cache_set, caching); | 1332 | struct cache_set *c = container_of(cl, struct cache_set, caching); |
1333 | struct cache *ca; | ||
1281 | struct btree *b; | 1334 | struct btree *b; |
1282 | 1335 | unsigned i; | |
1283 | /* Shut down allocator threads */ | ||
1284 | set_bit(CACHE_SET_STOPPING_2, &c->flags); | ||
1285 | wake_up(&c->alloc_wait); | ||
1286 | 1336 | ||
1287 | bch_cache_accounting_destroy(&c->accounting); | 1337 | bch_cache_accounting_destroy(&c->accounting); |
1288 | 1338 | ||
@@ -1295,7 +1345,11 @@ static void cache_set_flush(struct closure *cl) | |||
1295 | /* Should skip this if we're unregistering because of an error */ | 1345 | /* Should skip this if we're unregistering because of an error */ |
1296 | list_for_each_entry(b, &c->btree_cache, list) | 1346 | list_for_each_entry(b, &c->btree_cache, list) |
1297 | if (btree_node_dirty(b)) | 1347 | if (btree_node_dirty(b)) |
1298 | bch_btree_write(b, true, NULL); | 1348 | bch_btree_node_write(b, NULL); |
1349 | |||
1350 | for_each_cache(ca, c, i) | ||
1351 | if (ca->alloc_thread) | ||
1352 | kthread_stop(ca->alloc_thread); | ||
1299 | 1353 | ||
1300 | closure_return(cl); | 1354 | closure_return(cl); |
1301 | } | 1355 | } |
@@ -1303,18 +1357,22 @@ static void cache_set_flush(struct closure *cl) | |||
1303 | static void __cache_set_unregister(struct closure *cl) | 1357 | static void __cache_set_unregister(struct closure *cl) |
1304 | { | 1358 | { |
1305 | struct cache_set *c = container_of(cl, struct cache_set, caching); | 1359 | struct cache_set *c = container_of(cl, struct cache_set, caching); |
1306 | struct cached_dev *dc, *t; | 1360 | struct cached_dev *dc; |
1307 | size_t i; | 1361 | size_t i; |
1308 | 1362 | ||
1309 | mutex_lock(&bch_register_lock); | 1363 | mutex_lock(&bch_register_lock); |
1310 | 1364 | ||
1311 | if (test_bit(CACHE_SET_UNREGISTERING, &c->flags)) | ||
1312 | list_for_each_entry_safe(dc, t, &c->cached_devs, list) | ||
1313 | bch_cached_dev_detach(dc); | ||
1314 | |||
1315 | for (i = 0; i < c->nr_uuids; i++) | 1365 | for (i = 0; i < c->nr_uuids; i++) |
1316 | if (c->devices[i] && UUID_FLASH_ONLY(&c->uuids[i])) | 1366 | if (c->devices[i]) { |
1317 | bcache_device_stop(c->devices[i]); | 1367 | if (!UUID_FLASH_ONLY(&c->uuids[i]) && |
1368 | test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { | ||
1369 | dc = container_of(c->devices[i], | ||
1370 | struct cached_dev, disk); | ||
1371 | bch_cached_dev_detach(dc); | ||
1372 | } else { | ||
1373 | bcache_device_stop(c->devices[i]); | ||
1374 | } | ||
1375 | } | ||
1318 | 1376 | ||
1319 | mutex_unlock(&bch_register_lock); | 1377 | mutex_unlock(&bch_register_lock); |
1320 | 1378 | ||
@@ -1373,9 +1431,9 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) | |||
1373 | c->btree_pages = max_t(int, c->btree_pages / 4, | 1431 | c->btree_pages = max_t(int, c->btree_pages / 4, |
1374 | BTREE_MAX_PAGES); | 1432 | BTREE_MAX_PAGES); |
1375 | 1433 | ||
1376 | init_waitqueue_head(&c->alloc_wait); | 1434 | c->sort_crit_factor = int_sqrt(c->btree_pages); |
1435 | |||
1377 | mutex_init(&c->bucket_lock); | 1436 | mutex_init(&c->bucket_lock); |
1378 | mutex_init(&c->fill_lock); | ||
1379 | mutex_init(&c->sort_lock); | 1437 | mutex_init(&c->sort_lock); |
1380 | spin_lock_init(&c->sort_time_lock); | 1438 | spin_lock_init(&c->sort_time_lock); |
1381 | closure_init_unlocked(&c->sb_write); | 1439 | closure_init_unlocked(&c->sb_write); |
@@ -1401,8 +1459,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) | |||
1401 | !(c->bio_meta = mempool_create_kmalloc_pool(2, | 1459 | !(c->bio_meta = mempool_create_kmalloc_pool(2, |
1402 | sizeof(struct bbio) + sizeof(struct bio_vec) * | 1460 | sizeof(struct bbio) + sizeof(struct bio_vec) * |
1403 | bucket_pages(c))) || | 1461 | bucket_pages(c))) || |
1462 | !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || | ||
1404 | !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || | 1463 | !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || |
1405 | !(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) || | ||
1406 | !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) || | 1464 | !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) || |
1407 | !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || | 1465 | !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || |
1408 | bch_journal_alloc(c) || | 1466 | bch_journal_alloc(c) || |
@@ -1410,8 +1468,6 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) | |||
1410 | bch_open_buckets_alloc(c)) | 1468 | bch_open_buckets_alloc(c)) |
1411 | goto err; | 1469 | goto err; |
1412 | 1470 | ||
1413 | c->fill_iter->size = sb->bucket_size / sb->block_size; | ||
1414 | |||
1415 | c->congested_read_threshold_us = 2000; | 1471 | c->congested_read_threshold_us = 2000; |
1416 | c->congested_write_threshold_us = 20000; | 1472 | c->congested_write_threshold_us = 20000; |
1417 | c->error_limit = 8 << IO_ERROR_SHIFT; | 1473 | c->error_limit = 8 << IO_ERROR_SHIFT; |
@@ -1496,9 +1552,10 @@ static void run_cache_set(struct cache_set *c) | |||
1496 | */ | 1552 | */ |
1497 | bch_journal_next(&c->journal); | 1553 | bch_journal_next(&c->journal); |
1498 | 1554 | ||
1555 | err = "error starting allocator thread"; | ||
1499 | for_each_cache(ca, c, i) | 1556 | for_each_cache(ca, c, i) |
1500 | closure_call(&ca->alloc, bch_allocator_thread, | 1557 | if (bch_cache_allocator_start(ca)) |
1501 | system_wq, &c->cl); | 1558 | goto err; |
1502 | 1559 | ||
1503 | /* | 1560 | /* |
1504 | * First place it's safe to allocate: btree_check() and | 1561 | * First place it's safe to allocate: btree_check() and |
@@ -1531,17 +1588,16 @@ static void run_cache_set(struct cache_set *c) | |||
1531 | 1588 | ||
1532 | bch_btree_gc_finish(c); | 1589 | bch_btree_gc_finish(c); |
1533 | 1590 | ||
1591 | err = "error starting allocator thread"; | ||
1534 | for_each_cache(ca, c, i) | 1592 | for_each_cache(ca, c, i) |
1535 | closure_call(&ca->alloc, bch_allocator_thread, | 1593 | if (bch_cache_allocator_start(ca)) |
1536 | ca->alloc_workqueue, &c->cl); | 1594 | goto err; |
1537 | 1595 | ||
1538 | mutex_lock(&c->bucket_lock); | 1596 | mutex_lock(&c->bucket_lock); |
1539 | for_each_cache(ca, c, i) | 1597 | for_each_cache(ca, c, i) |
1540 | bch_prio_write(ca); | 1598 | bch_prio_write(ca); |
1541 | mutex_unlock(&c->bucket_lock); | 1599 | mutex_unlock(&c->bucket_lock); |
1542 | 1600 | ||
1543 | wake_up(&c->alloc_wait); | ||
1544 | |||
1545 | err = "cannot allocate new UUID bucket"; | 1601 | err = "cannot allocate new UUID bucket"; |
1546 | if (__uuid_write(c)) | 1602 | if (__uuid_write(c)) |
1547 | goto err_unlock_gc; | 1603 | goto err_unlock_gc; |
@@ -1552,7 +1608,7 @@ static void run_cache_set(struct cache_set *c) | |||
1552 | goto err_unlock_gc; | 1608 | goto err_unlock_gc; |
1553 | 1609 | ||
1554 | bkey_copy_key(&c->root->key, &MAX_KEY); | 1610 | bkey_copy_key(&c->root->key, &MAX_KEY); |
1555 | bch_btree_write(c->root, true, &op); | 1611 | bch_btree_node_write(c->root, &op.cl); |
1556 | 1612 | ||
1557 | bch_btree_set_root(c->root); | 1613 | bch_btree_set_root(c->root); |
1558 | rw_unlock(true, c->root); | 1614 | rw_unlock(true, c->root); |
@@ -1673,9 +1729,6 @@ void bch_cache_release(struct kobject *kobj) | |||
1673 | 1729 | ||
1674 | bio_split_pool_free(&ca->bio_split_hook); | 1730 | bio_split_pool_free(&ca->bio_split_hook); |
1675 | 1731 | ||
1676 | if (ca->alloc_workqueue) | ||
1677 | destroy_workqueue(ca->alloc_workqueue); | ||
1678 | |||
1679 | free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); | 1732 | free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); |
1680 | kfree(ca->prio_buckets); | 1733 | kfree(ca->prio_buckets); |
1681 | vfree(ca->buckets); | 1734 | vfree(ca->buckets); |
@@ -1723,7 +1776,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) | |||
1723 | !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) * | 1776 | !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) * |
1724 | 2, GFP_KERNEL)) || | 1777 | 2, GFP_KERNEL)) || |
1725 | !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || | 1778 | !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || |
1726 | !(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) || | ||
1727 | bio_split_pool_init(&ca->bio_split_hook)) | 1779 | bio_split_pool_init(&ca->bio_split_hook)) |
1728 | return -ENOMEM; | 1780 | return -ENOMEM; |
1729 | 1781 | ||
@@ -1786,6 +1838,36 @@ static ssize_t register_bcache(struct kobject *, struct kobj_attribute *, | |||
1786 | kobj_attribute_write(register, register_bcache); | 1838 | kobj_attribute_write(register, register_bcache); |
1787 | kobj_attribute_write(register_quiet, register_bcache); | 1839 | kobj_attribute_write(register_quiet, register_bcache); |
1788 | 1840 | ||
1841 | static bool bch_is_open_backing(struct block_device *bdev) { | ||
1842 | struct cache_set *c, *tc; | ||
1843 | struct cached_dev *dc, *t; | ||
1844 | |||
1845 | list_for_each_entry_safe(c, tc, &bch_cache_sets, list) | ||
1846 | list_for_each_entry_safe(dc, t, &c->cached_devs, list) | ||
1847 | if (dc->bdev == bdev) | ||
1848 | return true; | ||
1849 | list_for_each_entry_safe(dc, t, &uncached_devices, list) | ||
1850 | if (dc->bdev == bdev) | ||
1851 | return true; | ||
1852 | return false; | ||
1853 | } | ||
1854 | |||
1855 | static bool bch_is_open_cache(struct block_device *bdev) { | ||
1856 | struct cache_set *c, *tc; | ||
1857 | struct cache *ca; | ||
1858 | unsigned i; | ||
1859 | |||
1860 | list_for_each_entry_safe(c, tc, &bch_cache_sets, list) | ||
1861 | for_each_cache(ca, c, i) | ||
1862 | if (ca->bdev == bdev) | ||
1863 | return true; | ||
1864 | return false; | ||
1865 | } | ||
1866 | |||
1867 | static bool bch_is_open(struct block_device *bdev) { | ||
1868 | return bch_is_open_cache(bdev) || bch_is_open_backing(bdev); | ||
1869 | } | ||
1870 | |||
1789 | static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, | 1871 | static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, |
1790 | const char *buffer, size_t size) | 1872 | const char *buffer, size_t size) |
1791 | { | 1873 | { |
@@ -1810,8 +1892,13 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, | |||
1810 | FMODE_READ|FMODE_WRITE|FMODE_EXCL, | 1892 | FMODE_READ|FMODE_WRITE|FMODE_EXCL, |
1811 | sb); | 1893 | sb); |
1812 | if (IS_ERR(bdev)) { | 1894 | if (IS_ERR(bdev)) { |
1813 | if (bdev == ERR_PTR(-EBUSY)) | 1895 | if (bdev == ERR_PTR(-EBUSY)) { |
1814 | err = "device busy"; | 1896 | bdev = lookup_bdev(strim(path)); |
1897 | if (!IS_ERR(bdev) && bch_is_open(bdev)) | ||
1898 | err = "device already registered"; | ||
1899 | else | ||
1900 | err = "device busy"; | ||
1901 | } | ||
1815 | goto err; | 1902 | goto err; |
1816 | } | 1903 | } |
1817 | 1904 | ||
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 4d9cca47e4c6..12a2c2846f99 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c | |||
@@ -9,7 +9,9 @@ | |||
9 | #include "sysfs.h" | 9 | #include "sysfs.h" |
10 | #include "btree.h" | 10 | #include "btree.h" |
11 | #include "request.h" | 11 | #include "request.h" |
12 | #include "writeback.h" | ||
12 | 13 | ||
14 | #include <linux/blkdev.h> | ||
13 | #include <linux/sort.h> | 15 | #include <linux/sort.h> |
14 | 16 | ||
15 | static const char * const cache_replacement_policies[] = { | 17 | static const char * const cache_replacement_policies[] = { |
@@ -79,6 +81,9 @@ rw_attribute(writeback_rate_p_term_inverse); | |||
79 | rw_attribute(writeback_rate_d_smooth); | 81 | rw_attribute(writeback_rate_d_smooth); |
80 | read_attribute(writeback_rate_debug); | 82 | read_attribute(writeback_rate_debug); |
81 | 83 | ||
84 | read_attribute(stripe_size); | ||
85 | read_attribute(partial_stripes_expensive); | ||
86 | |||
82 | rw_attribute(synchronous); | 87 | rw_attribute(synchronous); |
83 | rw_attribute(journal_delay_ms); | 88 | rw_attribute(journal_delay_ms); |
84 | rw_attribute(discard); | 89 | rw_attribute(discard); |
@@ -127,7 +132,7 @@ SHOW(__bch_cached_dev) | |||
127 | char derivative[20]; | 132 | char derivative[20]; |
128 | char target[20]; | 133 | char target[20]; |
129 | bch_hprint(dirty, | 134 | bch_hprint(dirty, |
130 | atomic_long_read(&dc->disk.sectors_dirty) << 9); | 135 | bcache_dev_sectors_dirty(&dc->disk) << 9); |
131 | bch_hprint(derivative, dc->writeback_rate_derivative << 9); | 136 | bch_hprint(derivative, dc->writeback_rate_derivative << 9); |
132 | bch_hprint(target, dc->writeback_rate_target << 9); | 137 | bch_hprint(target, dc->writeback_rate_target << 9); |
133 | 138 | ||
@@ -143,7 +148,10 @@ SHOW(__bch_cached_dev) | |||
143 | } | 148 | } |
144 | 149 | ||
145 | sysfs_hprint(dirty_data, | 150 | sysfs_hprint(dirty_data, |
146 | atomic_long_read(&dc->disk.sectors_dirty) << 9); | 151 | bcache_dev_sectors_dirty(&dc->disk) << 9); |
152 | |||
153 | sysfs_hprint(stripe_size, (1 << dc->disk.stripe_size_bits) << 9); | ||
154 | var_printf(partial_stripes_expensive, "%u"); | ||
147 | 155 | ||
148 | var_printf(sequential_merge, "%i"); | 156 | var_printf(sequential_merge, "%i"); |
149 | var_hprint(sequential_cutoff); | 157 | var_hprint(sequential_cutoff); |
@@ -170,6 +178,7 @@ STORE(__cached_dev) | |||
170 | disk.kobj); | 178 | disk.kobj); |
171 | unsigned v = size; | 179 | unsigned v = size; |
172 | struct cache_set *c; | 180 | struct cache_set *c; |
181 | struct kobj_uevent_env *env; | ||
173 | 182 | ||
174 | #define d_strtoul(var) sysfs_strtoul(var, dc->var) | 183 | #define d_strtoul(var) sysfs_strtoul(var, dc->var) |
175 | #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) | 184 | #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) |
@@ -214,6 +223,7 @@ STORE(__cached_dev) | |||
214 | } | 223 | } |
215 | 224 | ||
216 | if (attr == &sysfs_label) { | 225 | if (attr == &sysfs_label) { |
226 | /* note: endlines are preserved */ | ||
217 | memcpy(dc->sb.label, buf, SB_LABEL_SIZE); | 227 | memcpy(dc->sb.label, buf, SB_LABEL_SIZE); |
218 | bch_write_bdev_super(dc, NULL); | 228 | bch_write_bdev_super(dc, NULL); |
219 | if (dc->disk.c) { | 229 | if (dc->disk.c) { |
@@ -221,6 +231,15 @@ STORE(__cached_dev) | |||
221 | buf, SB_LABEL_SIZE); | 231 | buf, SB_LABEL_SIZE); |
222 | bch_uuid_write(dc->disk.c); | 232 | bch_uuid_write(dc->disk.c); |
223 | } | 233 | } |
234 | env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL); | ||
235 | if (!env) | ||
236 | return -ENOMEM; | ||
237 | add_uevent_var(env, "DRIVER=bcache"); | ||
238 | add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid), | ||
239 | add_uevent_var(env, "CACHED_LABEL=%s", buf); | ||
240 | kobject_uevent_env( | ||
241 | &disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, env->envp); | ||
242 | kfree(env); | ||
224 | } | 243 | } |
225 | 244 | ||
226 | if (attr == &sysfs_attach) { | 245 | if (attr == &sysfs_attach) { |
@@ -284,6 +303,8 @@ static struct attribute *bch_cached_dev_files[] = { | |||
284 | &sysfs_writeback_rate_d_smooth, | 303 | &sysfs_writeback_rate_d_smooth, |
285 | &sysfs_writeback_rate_debug, | 304 | &sysfs_writeback_rate_debug, |
286 | &sysfs_dirty_data, | 305 | &sysfs_dirty_data, |
306 | &sysfs_stripe_size, | ||
307 | &sysfs_partial_stripes_expensive, | ||
287 | &sysfs_sequential_cutoff, | 308 | &sysfs_sequential_cutoff, |
288 | &sysfs_sequential_merge, | 309 | &sysfs_sequential_merge, |
289 | &sysfs_clear_stats, | 310 | &sysfs_clear_stats, |
@@ -665,12 +686,10 @@ SHOW(__bch_cache) | |||
665 | int cmp(const void *l, const void *r) | 686 | int cmp(const void *l, const void *r) |
666 | { return *((uint16_t *) r) - *((uint16_t *) l); } | 687 | { return *((uint16_t *) r) - *((uint16_t *) l); } |
667 | 688 | ||
668 | /* Number of quantiles we compute */ | ||
669 | const unsigned nq = 31; | ||
670 | |||
671 | size_t n = ca->sb.nbuckets, i, unused, btree; | 689 | size_t n = ca->sb.nbuckets, i, unused, btree; |
672 | uint64_t sum = 0; | 690 | uint64_t sum = 0; |
673 | uint16_t q[nq], *p, *cached; | 691 | /* Compute 31 quantiles */ |
692 | uint16_t q[31], *p, *cached; | ||
674 | ssize_t ret; | 693 | ssize_t ret; |
675 | 694 | ||
676 | cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t)); | 695 | cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t)); |
@@ -703,26 +722,29 @@ SHOW(__bch_cache) | |||
703 | if (n) | 722 | if (n) |
704 | do_div(sum, n); | 723 | do_div(sum, n); |
705 | 724 | ||
706 | for (i = 0; i < nq; i++) | 725 | for (i = 0; i < ARRAY_SIZE(q); i++) |
707 | q[i] = INITIAL_PRIO - cached[n * (i + 1) / (nq + 1)]; | 726 | q[i] = INITIAL_PRIO - cached[n * (i + 1) / |
727 | (ARRAY_SIZE(q) + 1)]; | ||
708 | 728 | ||
709 | vfree(p); | 729 | vfree(p); |
710 | 730 | ||
711 | ret = snprintf(buf, PAGE_SIZE, | 731 | ret = scnprintf(buf, PAGE_SIZE, |
712 | "Unused: %zu%%\n" | 732 | "Unused: %zu%%\n" |
713 | "Metadata: %zu%%\n" | 733 | "Metadata: %zu%%\n" |
714 | "Average: %llu\n" | 734 | "Average: %llu\n" |
715 | "Sectors per Q: %zu\n" | 735 | "Sectors per Q: %zu\n" |
716 | "Quantiles: [", | 736 | "Quantiles: [", |
717 | unused * 100 / (size_t) ca->sb.nbuckets, | 737 | unused * 100 / (size_t) ca->sb.nbuckets, |
718 | btree * 100 / (size_t) ca->sb.nbuckets, sum, | 738 | btree * 100 / (size_t) ca->sb.nbuckets, sum, |
719 | n * ca->sb.bucket_size / (nq + 1)); | 739 | n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); |
720 | 740 | ||
721 | for (i = 0; i < nq && ret < (ssize_t) PAGE_SIZE; i++) | 741 | for (i = 0; i < ARRAY_SIZE(q); i++) |
722 | ret += snprintf(buf + ret, PAGE_SIZE - ret, | 742 | ret += scnprintf(buf + ret, PAGE_SIZE - ret, |
723 | i < nq - 1 ? "%u " : "%u]\n", q[i]); | 743 | "%u ", q[i]); |
724 | 744 | ret--; | |
725 | buf[PAGE_SIZE - 1] = '\0'; | 745 | |
746 | ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n"); | ||
747 | |||
726 | return ret; | 748 | return ret; |
727 | } | 749 | } |
728 | 750 | ||
diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c index 983f9bb411bc..f7b6c197f90f 100644 --- a/drivers/md/bcache/trace.c +++ b/drivers/md/bcache/trace.c | |||
@@ -2,6 +2,7 @@ | |||
2 | #include "btree.h" | 2 | #include "btree.h" |
3 | #include "request.h" | 3 | #include "request.h" |
4 | 4 | ||
5 | #include <linux/blktrace_api.h> | ||
5 | #include <linux/module.h> | 6 | #include <linux/module.h> |
6 | 7 | ||
7 | #define CREATE_TRACE_POINTS | 8 | #define CREATE_TRACE_POINTS |
@@ -9,18 +10,44 @@ | |||
9 | 10 | ||
10 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start); | 11 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start); |
11 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end); | 12 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end); |
12 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_passthrough); | 13 | |
13 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_hit); | 14 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_sequential); |
14 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_miss); | 15 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_congested); |
16 | |||
17 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read); | ||
18 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write); | ||
15 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry); | 19 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry); |
16 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writethrough); | 20 | |
17 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); | 21 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert); |
18 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_skip); | 22 | |
23 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_replay_key); | ||
24 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write); | ||
25 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_full); | ||
26 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_entry_full); | ||
27 | |||
28 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_cache_cannibalize); | ||
29 | |||
19 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read); | 30 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read); |
20 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write); | 31 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write); |
21 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_dirty); | 32 | |
22 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_dirty); | 33 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc); |
23 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write); | 34 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc_fail); |
24 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert); | 35 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_free); |
36 | |||
37 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_gc_coalesce); | ||
25 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start); | 38 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start); |
26 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end); | 39 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end); |
40 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy); | ||
41 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy_collision); | ||
42 | |||
43 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_insert_key); | ||
44 | |||
45 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split); | ||
46 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact); | ||
47 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root); | ||
48 | |||
49 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_invalidate); | ||
50 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail); | ||
51 | |||
52 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); | ||
53 | EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback_collision); | ||
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index da3a99e85b1e..98eb81159a22 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c | |||
@@ -228,23 +228,6 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, | |||
228 | } | 228 | } |
229 | } | 229 | } |
230 | 230 | ||
231 | int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp) | ||
232 | { | ||
233 | int i; | ||
234 | struct bio_vec *bv; | ||
235 | |||
236 | bio_for_each_segment(bv, bio, i) { | ||
237 | bv->bv_page = alloc_page(gfp); | ||
238 | if (!bv->bv_page) { | ||
239 | while (bv-- != bio->bi_io_vec + bio->bi_idx) | ||
240 | __free_page(bv->bv_page); | ||
241 | return -ENOMEM; | ||
242 | } | ||
243 | } | ||
244 | |||
245 | return 0; | ||
246 | } | ||
247 | |||
248 | /* | 231 | /* |
249 | * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any | 232 | * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any |
250 | * use permitted, subject to terms of PostgreSQL license; see.) | 233 | * use permitted, subject to terms of PostgreSQL license; see.) |
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 577393e38c3a..1ae2a73ad85f 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h | |||
@@ -15,8 +15,6 @@ | |||
15 | 15 | ||
16 | struct closure; | 16 | struct closure; |
17 | 17 | ||
18 | #include <trace/events/bcache.h> | ||
19 | |||
20 | #ifdef CONFIG_BCACHE_EDEBUG | 18 | #ifdef CONFIG_BCACHE_EDEBUG |
21 | 19 | ||
22 | #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) | 20 | #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) |
@@ -566,12 +564,8 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) | |||
566 | return x; | 564 | return x; |
567 | } | 565 | } |
568 | 566 | ||
569 | #define bio_end(bio) ((bio)->bi_sector + bio_sectors(bio)) | ||
570 | |||
571 | void bch_bio_map(struct bio *bio, void *base); | 567 | void bch_bio_map(struct bio *bio, void *base); |
572 | 568 | ||
573 | int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp); | ||
574 | |||
575 | static inline sector_t bdev_sectors(struct block_device *bdev) | 569 | static inline sector_t bdev_sectors(struct block_device *bdev) |
576 | { | 570 | { |
577 | return bdev->bd_inode->i_size >> 9; | 571 | return bdev->bd_inode->i_size >> 9; |
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 2714ed3991d1..22cbff551628 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c | |||
@@ -9,6 +9,9 @@ | |||
9 | #include "bcache.h" | 9 | #include "bcache.h" |
10 | #include "btree.h" | 10 | #include "btree.h" |
11 | #include "debug.h" | 11 | #include "debug.h" |
12 | #include "writeback.h" | ||
13 | |||
14 | #include <trace/events/bcache.h> | ||
12 | 15 | ||
13 | static struct workqueue_struct *dirty_wq; | 16 | static struct workqueue_struct *dirty_wq; |
14 | 17 | ||
@@ -36,7 +39,7 @@ static void __update_writeback_rate(struct cached_dev *dc) | |||
36 | 39 | ||
37 | int change = 0; | 40 | int change = 0; |
38 | int64_t error; | 41 | int64_t error; |
39 | int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty); | 42 | int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); |
40 | int64_t derivative = dirty - dc->disk.sectors_dirty_last; | 43 | int64_t derivative = dirty - dc->disk.sectors_dirty_last; |
41 | 44 | ||
42 | dc->disk.sectors_dirty_last = dirty; | 45 | dc->disk.sectors_dirty_last = dirty; |
@@ -105,6 +108,31 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k) | |||
105 | return KEY_DIRTY(k); | 108 | return KEY_DIRTY(k); |
106 | } | 109 | } |
107 | 110 | ||
111 | static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) | ||
112 | { | ||
113 | uint64_t stripe; | ||
114 | unsigned nr_sectors = KEY_SIZE(k); | ||
115 | struct cached_dev *dc = container_of(buf, struct cached_dev, | ||
116 | writeback_keys); | ||
117 | unsigned stripe_size = 1 << dc->disk.stripe_size_bits; | ||
118 | |||
119 | if (!KEY_DIRTY(k)) | ||
120 | return false; | ||
121 | |||
122 | stripe = KEY_START(k) >> dc->disk.stripe_size_bits; | ||
123 | while (1) { | ||
124 | if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) != | ||
125 | stripe_size) | ||
126 | return false; | ||
127 | |||
128 | if (nr_sectors <= stripe_size) | ||
129 | return true; | ||
130 | |||
131 | nr_sectors -= stripe_size; | ||
132 | stripe++; | ||
133 | } | ||
134 | } | ||
135 | |||
108 | static void dirty_init(struct keybuf_key *w) | 136 | static void dirty_init(struct keybuf_key *w) |
109 | { | 137 | { |
110 | struct dirty_io *io = w->private; | 138 | struct dirty_io *io = w->private; |
@@ -149,7 +177,22 @@ static void refill_dirty(struct closure *cl) | |||
149 | searched_from_start = true; | 177 | searched_from_start = true; |
150 | } | 178 | } |
151 | 179 | ||
152 | bch_refill_keybuf(dc->disk.c, buf, &end); | 180 | if (dc->partial_stripes_expensive) { |
181 | uint64_t i; | ||
182 | |||
183 | for (i = 0; i < dc->disk.nr_stripes; i++) | ||
184 | if (atomic_read(dc->disk.stripe_sectors_dirty + i) == | ||
185 | 1 << dc->disk.stripe_size_bits) | ||
186 | goto full_stripes; | ||
187 | |||
188 | goto normal_refill; | ||
189 | full_stripes: | ||
190 | bch_refill_keybuf(dc->disk.c, buf, &end, | ||
191 | dirty_full_stripe_pred); | ||
192 | } else { | ||
193 | normal_refill: | ||
194 | bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); | ||
195 | } | ||
153 | 196 | ||
154 | if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { | 197 | if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { |
155 | /* Searched the entire btree - delay awhile */ | 198 | /* Searched the entire btree - delay awhile */ |
@@ -181,10 +224,8 @@ void bch_writeback_queue(struct cached_dev *dc) | |||
181 | } | 224 | } |
182 | } | 225 | } |
183 | 226 | ||
184 | void bch_writeback_add(struct cached_dev *dc, unsigned sectors) | 227 | void bch_writeback_add(struct cached_dev *dc) |
185 | { | 228 | { |
186 | atomic_long_add(sectors, &dc->disk.sectors_dirty); | ||
187 | |||
188 | if (!atomic_read(&dc->has_dirty) && | 229 | if (!atomic_read(&dc->has_dirty) && |
189 | !atomic_xchg(&dc->has_dirty, 1)) { | 230 | !atomic_xchg(&dc->has_dirty, 1)) { |
190 | atomic_inc(&dc->count); | 231 | atomic_inc(&dc->count); |
@@ -203,6 +244,34 @@ void bch_writeback_add(struct cached_dev *dc, unsigned sectors) | |||
203 | } | 244 | } |
204 | } | 245 | } |
205 | 246 | ||
247 | void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, | ||
248 | uint64_t offset, int nr_sectors) | ||
249 | { | ||
250 | struct bcache_device *d = c->devices[inode]; | ||
251 | unsigned stripe_size, stripe_offset; | ||
252 | uint64_t stripe; | ||
253 | |||
254 | if (!d) | ||
255 | return; | ||
256 | |||
257 | stripe_size = 1 << d->stripe_size_bits; | ||
258 | stripe = offset >> d->stripe_size_bits; | ||
259 | stripe_offset = offset & (stripe_size - 1); | ||
260 | |||
261 | while (nr_sectors) { | ||
262 | int s = min_t(unsigned, abs(nr_sectors), | ||
263 | stripe_size - stripe_offset); | ||
264 | |||
265 | if (nr_sectors < 0) | ||
266 | s = -s; | ||
267 | |||
268 | atomic_add(s, d->stripe_sectors_dirty + stripe); | ||
269 | nr_sectors -= s; | ||
270 | stripe_offset = 0; | ||
271 | stripe++; | ||
272 | } | ||
273 | } | ||
274 | |||
206 | /* Background writeback - IO loop */ | 275 | /* Background writeback - IO loop */ |
207 | 276 | ||
208 | static void dirty_io_destructor(struct closure *cl) | 277 | static void dirty_io_destructor(struct closure *cl) |
@@ -216,9 +285,10 @@ static void write_dirty_finish(struct closure *cl) | |||
216 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | 285 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); |
217 | struct keybuf_key *w = io->bio.bi_private; | 286 | struct keybuf_key *w = io->bio.bi_private; |
218 | struct cached_dev *dc = io->dc; | 287 | struct cached_dev *dc = io->dc; |
219 | struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt); | 288 | struct bio_vec *bv; |
289 | int i; | ||
220 | 290 | ||
221 | while (bv-- != io->bio.bi_io_vec) | 291 | bio_for_each_segment_all(bv, &io->bio, i) |
222 | __free_page(bv->bv_page); | 292 | __free_page(bv->bv_page); |
223 | 293 | ||
224 | /* This is kind of a dumb way of signalling errors. */ | 294 | /* This is kind of a dumb way of signalling errors. */ |
@@ -236,10 +306,12 @@ static void write_dirty_finish(struct closure *cl) | |||
236 | for (i = 0; i < KEY_PTRS(&w->key); i++) | 306 | for (i = 0; i < KEY_PTRS(&w->key); i++) |
237 | atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); | 307 | atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); |
238 | 308 | ||
239 | pr_debug("clearing %s", pkey(&w->key)); | ||
240 | bch_btree_insert(&op, dc->disk.c); | 309 | bch_btree_insert(&op, dc->disk.c); |
241 | closure_sync(&op.cl); | 310 | closure_sync(&op.cl); |
242 | 311 | ||
312 | if (op.insert_collision) | ||
313 | trace_bcache_writeback_collision(&w->key); | ||
314 | |||
243 | atomic_long_inc(op.insert_collision | 315 | atomic_long_inc(op.insert_collision |
244 | ? &dc->disk.c->writeback_keys_failed | 316 | ? &dc->disk.c->writeback_keys_failed |
245 | : &dc->disk.c->writeback_keys_done); | 317 | : &dc->disk.c->writeback_keys_done); |
@@ -275,7 +347,6 @@ static void write_dirty(struct closure *cl) | |||
275 | io->bio.bi_bdev = io->dc->bdev; | 347 | io->bio.bi_bdev = io->dc->bdev; |
276 | io->bio.bi_end_io = dirty_endio; | 348 | io->bio.bi_end_io = dirty_endio; |
277 | 349 | ||
278 | trace_bcache_write_dirty(&io->bio); | ||
279 | closure_bio_submit(&io->bio, cl, &io->dc->disk); | 350 | closure_bio_submit(&io->bio, cl, &io->dc->disk); |
280 | 351 | ||
281 | continue_at(cl, write_dirty_finish, dirty_wq); | 352 | continue_at(cl, write_dirty_finish, dirty_wq); |
@@ -296,7 +367,6 @@ static void read_dirty_submit(struct closure *cl) | |||
296 | { | 367 | { |
297 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); | 368 | struct dirty_io *io = container_of(cl, struct dirty_io, cl); |
298 | 369 | ||
299 | trace_bcache_read_dirty(&io->bio); | ||
300 | closure_bio_submit(&io->bio, cl, &io->dc->disk); | 370 | closure_bio_submit(&io->bio, cl, &io->dc->disk); |
301 | 371 | ||
302 | continue_at(cl, write_dirty, dirty_wq); | 372 | continue_at(cl, write_dirty, dirty_wq); |
@@ -349,10 +419,10 @@ static void read_dirty(struct closure *cl) | |||
349 | io->bio.bi_rw = READ; | 419 | io->bio.bi_rw = READ; |
350 | io->bio.bi_end_io = read_dirty_endio; | 420 | io->bio.bi_end_io = read_dirty_endio; |
351 | 421 | ||
352 | if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) | 422 | if (bio_alloc_pages(&io->bio, GFP_KERNEL)) |
353 | goto err_free; | 423 | goto err_free; |
354 | 424 | ||
355 | pr_debug("%s", pkey(&w->key)); | 425 | trace_bcache_writeback(&w->key); |
356 | 426 | ||
357 | closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); | 427 | closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); |
358 | 428 | ||
@@ -375,12 +445,49 @@ err: | |||
375 | refill_dirty(cl); | 445 | refill_dirty(cl); |
376 | } | 446 | } |
377 | 447 | ||
448 | /* Init */ | ||
449 | |||
450 | static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op, | ||
451 | struct cached_dev *dc) | ||
452 | { | ||
453 | struct bkey *k; | ||
454 | struct btree_iter iter; | ||
455 | |||
456 | bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0)); | ||
457 | while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) | ||
458 | if (!b->level) { | ||
459 | if (KEY_INODE(k) > dc->disk.id) | ||
460 | break; | ||
461 | |||
462 | if (KEY_DIRTY(k)) | ||
463 | bcache_dev_sectors_dirty_add(b->c, dc->disk.id, | ||
464 | KEY_START(k), | ||
465 | KEY_SIZE(k)); | ||
466 | } else { | ||
467 | btree(sectors_dirty_init, k, b, op, dc); | ||
468 | if (KEY_INODE(k) > dc->disk.id) | ||
469 | break; | ||
470 | |||
471 | cond_resched(); | ||
472 | } | ||
473 | |||
474 | return 0; | ||
475 | } | ||
476 | |||
477 | void bch_sectors_dirty_init(struct cached_dev *dc) | ||
478 | { | ||
479 | struct btree_op op; | ||
480 | |||
481 | bch_btree_op_init_stack(&op); | ||
482 | btree_root(sectors_dirty_init, dc->disk.c, &op, dc); | ||
483 | } | ||
484 | |||
378 | void bch_cached_dev_writeback_init(struct cached_dev *dc) | 485 | void bch_cached_dev_writeback_init(struct cached_dev *dc) |
379 | { | 486 | { |
380 | closure_init_unlocked(&dc->writeback); | 487 | closure_init_unlocked(&dc->writeback); |
381 | init_rwsem(&dc->writeback_lock); | 488 | init_rwsem(&dc->writeback_lock); |
382 | 489 | ||
383 | bch_keybuf_init(&dc->writeback_keys, dirty_pred); | 490 | bch_keybuf_init(&dc->writeback_keys); |
384 | 491 | ||
385 | dc->writeback_metadata = true; | 492 | dc->writeback_metadata = true; |
386 | dc->writeback_running = true; | 493 | dc->writeback_running = true; |
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h new file mode 100644 index 000000000000..c91f61bb95b6 --- /dev/null +++ b/drivers/md/bcache/writeback.h | |||
@@ -0,0 +1,64 @@ | |||
1 | #ifndef _BCACHE_WRITEBACK_H | ||
2 | #define _BCACHE_WRITEBACK_H | ||
3 | |||
4 | #define CUTOFF_WRITEBACK 40 | ||
5 | #define CUTOFF_WRITEBACK_SYNC 70 | ||
6 | |||
7 | static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) | ||
8 | { | ||
9 | uint64_t i, ret = 0; | ||
10 | |||
11 | for (i = 0; i < d->nr_stripes; i++) | ||
12 | ret += atomic_read(d->stripe_sectors_dirty + i); | ||
13 | |||
14 | return ret; | ||
15 | } | ||
16 | |||
17 | static inline bool bcache_dev_stripe_dirty(struct bcache_device *d, | ||
18 | uint64_t offset, | ||
19 | unsigned nr_sectors) | ||
20 | { | ||
21 | uint64_t stripe = offset >> d->stripe_size_bits; | ||
22 | |||
23 | while (1) { | ||
24 | if (atomic_read(d->stripe_sectors_dirty + stripe)) | ||
25 | return true; | ||
26 | |||
27 | if (nr_sectors <= 1 << d->stripe_size_bits) | ||
28 | return false; | ||
29 | |||
30 | nr_sectors -= 1 << d->stripe_size_bits; | ||
31 | stripe++; | ||
32 | } | ||
33 | } | ||
34 | |||
35 | static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, | ||
36 | unsigned cache_mode, bool would_skip) | ||
37 | { | ||
38 | unsigned in_use = dc->disk.c->gc_stats.in_use; | ||
39 | |||
40 | if (cache_mode != CACHE_MODE_WRITEBACK || | ||
41 | atomic_read(&dc->disk.detaching) || | ||
42 | in_use > CUTOFF_WRITEBACK_SYNC) | ||
43 | return false; | ||
44 | |||
45 | if (dc->partial_stripes_expensive && | ||
46 | bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector, | ||
47 | bio_sectors(bio))) | ||
48 | return true; | ||
49 | |||
50 | if (would_skip) | ||
51 | return false; | ||
52 | |||
53 | return bio->bi_rw & REQ_SYNC || | ||
54 | in_use <= CUTOFF_WRITEBACK; | ||
55 | } | ||
56 | |||
57 | void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); | ||
58 | void bch_writeback_queue(struct cached_dev *); | ||
59 | void bch_writeback_add(struct cached_dev *); | ||
60 | |||
61 | void bch_sectors_dirty_init(struct cached_dev *dc); | ||
62 | void bch_cached_dev_writeback_init(struct cached_dev *); | ||
63 | |||
64 | #endif | ||
diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 1b4d4ee1168f..de7d74ab3de6 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h | |||
@@ -177,7 +177,11 @@ enum drbd_ret_code { | |||
177 | ERR_NEED_APV_100 = 163, | 177 | ERR_NEED_APV_100 = 163, |
178 | ERR_NEED_ALLOW_TWO_PRI = 164, | 178 | ERR_NEED_ALLOW_TWO_PRI = 164, |
179 | ERR_MD_UNCLEAN = 165, | 179 | ERR_MD_UNCLEAN = 165, |
180 | 180 | ERR_MD_LAYOUT_CONNECTED = 166, | |
181 | ERR_MD_LAYOUT_TOO_BIG = 167, | ||
182 | ERR_MD_LAYOUT_TOO_SMALL = 168, | ||
183 | ERR_MD_LAYOUT_NO_FIT = 169, | ||
184 | ERR_IMPLICIT_SHRINK = 170, | ||
181 | /* insert new ones above this line */ | 185 | /* insert new ones above this line */ |
182 | AFTER_LAST_ERR_CODE | 186 | AFTER_LAST_ERR_CODE |
183 | }; | 187 | }; |
diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h index d0d8fac8a6e4..e8c44572b8cb 100644 --- a/include/linux/drbd_genl.h +++ b/include/linux/drbd_genl.h | |||
@@ -181,6 +181,8 @@ GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms, | |||
181 | __u64_field(1, DRBD_GENLA_F_MANDATORY, resize_size) | 181 | __u64_field(1, DRBD_GENLA_F_MANDATORY, resize_size) |
182 | __flg_field(2, DRBD_GENLA_F_MANDATORY, resize_force) | 182 | __flg_field(2, DRBD_GENLA_F_MANDATORY, resize_force) |
183 | __flg_field(3, DRBD_GENLA_F_MANDATORY, no_resync) | 183 | __flg_field(3, DRBD_GENLA_F_MANDATORY, no_resync) |
184 | __u32_field_def(4, 0 /* OPTIONAL */, al_stripes, DRBD_AL_STRIPES_DEF) | ||
185 | __u32_field_def(5, 0 /* OPTIONAL */, al_stripe_size, DRBD_AL_STRIPE_SIZE_DEF) | ||
184 | ) | 186 | ) |
185 | 187 | ||
186 | GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info, | 188 | GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info, |
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 1fedf2b17cc8..17e50bb00521 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h | |||
@@ -215,4 +215,13 @@ | |||
215 | #define DRBD_ALWAYS_ASBP_DEF 0 | 215 | #define DRBD_ALWAYS_ASBP_DEF 0 |
216 | #define DRBD_USE_RLE_DEF 1 | 216 | #define DRBD_USE_RLE_DEF 1 |
217 | 217 | ||
218 | #define DRBD_AL_STRIPES_MIN 1 | ||
219 | #define DRBD_AL_STRIPES_MAX 1024 | ||
220 | #define DRBD_AL_STRIPES_DEF 1 | ||
221 | #define DRBD_AL_STRIPES_SCALE '1' | ||
222 | |||
223 | #define DRBD_AL_STRIPE_SIZE_MIN 4 | ||
224 | #define DRBD_AL_STRIPE_SIZE_MAX 16777216 | ||
225 | #define DRBD_AL_STRIPE_SIZE_DEF 32 | ||
226 | #define DRBD_AL_STRIPE_SIZE_SCALE 'k' /* kilobytes */ | ||
218 | #endif | 227 | #endif |
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 3cc5a0b278c3..5ebda976ea93 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h | |||
@@ -9,9 +9,7 @@ | |||
9 | struct search; | 9 | struct search; |
10 | 10 | ||
11 | DECLARE_EVENT_CLASS(bcache_request, | 11 | DECLARE_EVENT_CLASS(bcache_request, |
12 | |||
13 | TP_PROTO(struct search *s, struct bio *bio), | 12 | TP_PROTO(struct search *s, struct bio *bio), |
14 | |||
15 | TP_ARGS(s, bio), | 13 | TP_ARGS(s, bio), |
16 | 14 | ||
17 | TP_STRUCT__entry( | 15 | TP_STRUCT__entry( |
@@ -22,7 +20,6 @@ DECLARE_EVENT_CLASS(bcache_request, | |||
22 | __field(dev_t, orig_sector ) | 20 | __field(dev_t, orig_sector ) |
23 | __field(unsigned int, nr_sector ) | 21 | __field(unsigned int, nr_sector ) |
24 | __array(char, rwbs, 6 ) | 22 | __array(char, rwbs, 6 ) |
25 | __array(char, comm, TASK_COMM_LEN ) | ||
26 | ), | 23 | ), |
27 | 24 | ||
28 | TP_fast_assign( | 25 | TP_fast_assign( |
@@ -33,36 +30,66 @@ DECLARE_EVENT_CLASS(bcache_request, | |||
33 | __entry->orig_sector = bio->bi_sector - 16; | 30 | __entry->orig_sector = bio->bi_sector - 16; |
34 | __entry->nr_sector = bio->bi_size >> 9; | 31 | __entry->nr_sector = bio->bi_size >> 9; |
35 | blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); | 32 | blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); |
36 | memcpy(__entry->comm, current->comm, TASK_COMM_LEN); | ||
37 | ), | 33 | ), |
38 | 34 | ||
39 | TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d @ %llu)", | 35 | TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)", |
40 | MAJOR(__entry->dev), MINOR(__entry->dev), | 36 | MAJOR(__entry->dev), MINOR(__entry->dev), |
41 | __entry->rwbs, | 37 | __entry->rwbs, (unsigned long long)__entry->sector, |
42 | (unsigned long long)__entry->sector, | 38 | __entry->nr_sector, __entry->orig_major, __entry->orig_minor, |
43 | __entry->nr_sector, __entry->comm, | ||
44 | __entry->orig_major, __entry->orig_minor, | ||
45 | (unsigned long long)__entry->orig_sector) | 39 | (unsigned long long)__entry->orig_sector) |
46 | ); | 40 | ); |
47 | 41 | ||
48 | DEFINE_EVENT(bcache_request, bcache_request_start, | 42 | DECLARE_EVENT_CLASS(bkey, |
43 | TP_PROTO(struct bkey *k), | ||
44 | TP_ARGS(k), | ||
49 | 45 | ||
50 | TP_PROTO(struct search *s, struct bio *bio), | 46 | TP_STRUCT__entry( |
47 | __field(u32, size ) | ||
48 | __field(u32, inode ) | ||
49 | __field(u64, offset ) | ||
50 | __field(bool, dirty ) | ||
51 | ), | ||
51 | 52 | ||
52 | TP_ARGS(s, bio) | 53 | TP_fast_assign( |
54 | __entry->inode = KEY_INODE(k); | ||
55 | __entry->offset = KEY_OFFSET(k); | ||
56 | __entry->size = KEY_SIZE(k); | ||
57 | __entry->dirty = KEY_DIRTY(k); | ||
58 | ), | ||
59 | |||
60 | TP_printk("%u:%llu len %u dirty %u", __entry->inode, | ||
61 | __entry->offset, __entry->size, __entry->dirty) | ||
53 | ); | 62 | ); |
54 | 63 | ||
55 | DEFINE_EVENT(bcache_request, bcache_request_end, | 64 | DECLARE_EVENT_CLASS(btree_node, |
65 | TP_PROTO(struct btree *b), | ||
66 | TP_ARGS(b), | ||
67 | |||
68 | TP_STRUCT__entry( | ||
69 | __field(size_t, bucket ) | ||
70 | ), | ||
56 | 71 | ||
72 | TP_fast_assign( | ||
73 | __entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0); | ||
74 | ), | ||
75 | |||
76 | TP_printk("bucket %zu", __entry->bucket) | ||
77 | ); | ||
78 | |||
79 | /* request.c */ | ||
80 | |||
81 | DEFINE_EVENT(bcache_request, bcache_request_start, | ||
57 | TP_PROTO(struct search *s, struct bio *bio), | 82 | TP_PROTO(struct search *s, struct bio *bio), |
83 | TP_ARGS(s, bio) | ||
84 | ); | ||
58 | 85 | ||
86 | DEFINE_EVENT(bcache_request, bcache_request_end, | ||
87 | TP_PROTO(struct search *s, struct bio *bio), | ||
59 | TP_ARGS(s, bio) | 88 | TP_ARGS(s, bio) |
60 | ); | 89 | ); |
61 | 90 | ||
62 | DECLARE_EVENT_CLASS(bcache_bio, | 91 | DECLARE_EVENT_CLASS(bcache_bio, |
63 | |||
64 | TP_PROTO(struct bio *bio), | 92 | TP_PROTO(struct bio *bio), |
65 | |||
66 | TP_ARGS(bio), | 93 | TP_ARGS(bio), |
67 | 94 | ||
68 | TP_STRUCT__entry( | 95 | TP_STRUCT__entry( |
@@ -70,7 +97,6 @@ DECLARE_EVENT_CLASS(bcache_bio, | |||
70 | __field(sector_t, sector ) | 97 | __field(sector_t, sector ) |
71 | __field(unsigned int, nr_sector ) | 98 | __field(unsigned int, nr_sector ) |
72 | __array(char, rwbs, 6 ) | 99 | __array(char, rwbs, 6 ) |
73 | __array(char, comm, TASK_COMM_LEN ) | ||
74 | ), | 100 | ), |
75 | 101 | ||
76 | TP_fast_assign( | 102 | TP_fast_assign( |
@@ -78,191 +104,328 @@ DECLARE_EVENT_CLASS(bcache_bio, | |||
78 | __entry->sector = bio->bi_sector; | 104 | __entry->sector = bio->bi_sector; |
79 | __entry->nr_sector = bio->bi_size >> 9; | 105 | __entry->nr_sector = bio->bi_size >> 9; |
80 | blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); | 106 | blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); |
81 | memcpy(__entry->comm, current->comm, TASK_COMM_LEN); | ||
82 | ), | 107 | ), |
83 | 108 | ||
84 | TP_printk("%d,%d %s %llu + %u [%s]", | 109 | TP_printk("%d,%d %s %llu + %u", |
85 | MAJOR(__entry->dev), MINOR(__entry->dev), | 110 | MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, |
86 | __entry->rwbs, | 111 | (unsigned long long)__entry->sector, __entry->nr_sector) |
87 | (unsigned long long)__entry->sector, | ||
88 | __entry->nr_sector, __entry->comm) | ||
89 | ); | 112 | ); |
90 | 113 | ||
91 | 114 | DEFINE_EVENT(bcache_bio, bcache_bypass_sequential, | |
92 | DEFINE_EVENT(bcache_bio, bcache_passthrough, | ||
93 | |||
94 | TP_PROTO(struct bio *bio), | 115 | TP_PROTO(struct bio *bio), |
116 | TP_ARGS(bio) | ||
117 | ); | ||
95 | 118 | ||
119 | DEFINE_EVENT(bcache_bio, bcache_bypass_congested, | ||
120 | TP_PROTO(struct bio *bio), | ||
96 | TP_ARGS(bio) | 121 | TP_ARGS(bio) |
97 | ); | 122 | ); |
98 | 123 | ||
99 | DEFINE_EVENT(bcache_bio, bcache_cache_hit, | 124 | TRACE_EVENT(bcache_read, |
125 | TP_PROTO(struct bio *bio, bool hit, bool bypass), | ||
126 | TP_ARGS(bio, hit, bypass), | ||
100 | 127 | ||
101 | TP_PROTO(struct bio *bio), | 128 | TP_STRUCT__entry( |
129 | __field(dev_t, dev ) | ||
130 | __field(sector_t, sector ) | ||
131 | __field(unsigned int, nr_sector ) | ||
132 | __array(char, rwbs, 6 ) | ||
133 | __field(bool, cache_hit ) | ||
134 | __field(bool, bypass ) | ||
135 | ), | ||
102 | 136 | ||
103 | TP_ARGS(bio) | 137 | TP_fast_assign( |
138 | __entry->dev = bio->bi_bdev->bd_dev; | ||
139 | __entry->sector = bio->bi_sector; | ||
140 | __entry->nr_sector = bio->bi_size >> 9; | ||
141 | blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); | ||
142 | __entry->cache_hit = hit; | ||
143 | __entry->bypass = bypass; | ||
144 | ), | ||
145 | |||
146 | TP_printk("%d,%d %s %llu + %u hit %u bypass %u", | ||
147 | MAJOR(__entry->dev), MINOR(__entry->dev), | ||
148 | __entry->rwbs, (unsigned long long)__entry->sector, | ||
149 | __entry->nr_sector, __entry->cache_hit, __entry->bypass) | ||
104 | ); | 150 | ); |
105 | 151 | ||
106 | DEFINE_EVENT(bcache_bio, bcache_cache_miss, | 152 | TRACE_EVENT(bcache_write, |
153 | TP_PROTO(struct bio *bio, bool writeback, bool bypass), | ||
154 | TP_ARGS(bio, writeback, bypass), | ||
107 | 155 | ||
108 | TP_PROTO(struct bio *bio), | 156 | TP_STRUCT__entry( |
157 | __field(dev_t, dev ) | ||
158 | __field(sector_t, sector ) | ||
159 | __field(unsigned int, nr_sector ) | ||
160 | __array(char, rwbs, 6 ) | ||
161 | __field(bool, writeback ) | ||
162 | __field(bool, bypass ) | ||
163 | ), | ||
109 | 164 | ||
110 | TP_ARGS(bio) | 165 | TP_fast_assign( |
166 | __entry->dev = bio->bi_bdev->bd_dev; | ||
167 | __entry->sector = bio->bi_sector; | ||
168 | __entry->nr_sector = bio->bi_size >> 9; | ||
169 | blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); | ||
170 | __entry->writeback = writeback; | ||
171 | __entry->bypass = bypass; | ||
172 | ), | ||
173 | |||
174 | TP_printk("%d,%d %s %llu + %u hit %u bypass %u", | ||
175 | MAJOR(__entry->dev), MINOR(__entry->dev), | ||
176 | __entry->rwbs, (unsigned long long)__entry->sector, | ||
177 | __entry->nr_sector, __entry->writeback, __entry->bypass) | ||
111 | ); | 178 | ); |
112 | 179 | ||
113 | DEFINE_EVENT(bcache_bio, bcache_read_retry, | 180 | DEFINE_EVENT(bcache_bio, bcache_read_retry, |
114 | |||
115 | TP_PROTO(struct bio *bio), | 181 | TP_PROTO(struct bio *bio), |
116 | |||
117 | TP_ARGS(bio) | 182 | TP_ARGS(bio) |
118 | ); | 183 | ); |
119 | 184 | ||
120 | DEFINE_EVENT(bcache_bio, bcache_writethrough, | 185 | DEFINE_EVENT(bkey, bcache_cache_insert, |
186 | TP_PROTO(struct bkey *k), | ||
187 | TP_ARGS(k) | ||
188 | ); | ||
121 | 189 | ||
122 | TP_PROTO(struct bio *bio), | 190 | /* Journal */ |
123 | 191 | ||
124 | TP_ARGS(bio) | 192 | DECLARE_EVENT_CLASS(cache_set, |
125 | ); | 193 | TP_PROTO(struct cache_set *c), |
194 | TP_ARGS(c), | ||
126 | 195 | ||
127 | DEFINE_EVENT(bcache_bio, bcache_writeback, | 196 | TP_STRUCT__entry( |
197 | __array(char, uuid, 16 ) | ||
198 | ), | ||
128 | 199 | ||
129 | TP_PROTO(struct bio *bio), | 200 | TP_fast_assign( |
201 | memcpy(__entry->uuid, c->sb.set_uuid, 16); | ||
202 | ), | ||
130 | 203 | ||
131 | TP_ARGS(bio) | 204 | TP_printk("%pU", __entry->uuid) |
132 | ); | 205 | ); |
133 | 206 | ||
134 | DEFINE_EVENT(bcache_bio, bcache_write_skip, | 207 | DEFINE_EVENT(bkey, bcache_journal_replay_key, |
135 | 208 | TP_PROTO(struct bkey *k), | |
136 | TP_PROTO(struct bio *bio), | 209 | TP_ARGS(k) |
210 | ); | ||
137 | 211 | ||
138 | TP_ARGS(bio) | 212 | DEFINE_EVENT(cache_set, bcache_journal_full, |
213 | TP_PROTO(struct cache_set *c), | ||
214 | TP_ARGS(c) | ||
139 | ); | 215 | ); |
140 | 216 | ||
141 | DEFINE_EVENT(bcache_bio, bcache_btree_read, | 217 | DEFINE_EVENT(cache_set, bcache_journal_entry_full, |
218 | TP_PROTO(struct cache_set *c), | ||
219 | TP_ARGS(c) | ||
220 | ); | ||
142 | 221 | ||
222 | DEFINE_EVENT(bcache_bio, bcache_journal_write, | ||
143 | TP_PROTO(struct bio *bio), | 223 | TP_PROTO(struct bio *bio), |
144 | |||
145 | TP_ARGS(bio) | 224 | TP_ARGS(bio) |
146 | ); | 225 | ); |
147 | 226 | ||
148 | DEFINE_EVENT(bcache_bio, bcache_btree_write, | 227 | /* Btree */ |
149 | 228 | ||
150 | TP_PROTO(struct bio *bio), | 229 | DEFINE_EVENT(cache_set, bcache_btree_cache_cannibalize, |
230 | TP_PROTO(struct cache_set *c), | ||
231 | TP_ARGS(c) | ||
232 | ); | ||
151 | 233 | ||
152 | TP_ARGS(bio) | 234 | DEFINE_EVENT(btree_node, bcache_btree_read, |
235 | TP_PROTO(struct btree *b), | ||
236 | TP_ARGS(b) | ||
153 | ); | 237 | ); |
154 | 238 | ||
155 | DEFINE_EVENT(bcache_bio, bcache_write_dirty, | 239 | TRACE_EVENT(bcache_btree_write, |
240 | TP_PROTO(struct btree *b), | ||
241 | TP_ARGS(b), | ||
156 | 242 | ||
157 | TP_PROTO(struct bio *bio), | 243 | TP_STRUCT__entry( |
244 | __field(size_t, bucket ) | ||
245 | __field(unsigned, block ) | ||
246 | __field(unsigned, keys ) | ||
247 | ), | ||
158 | 248 | ||
159 | TP_ARGS(bio) | 249 | TP_fast_assign( |
250 | __entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0); | ||
251 | __entry->block = b->written; | ||
252 | __entry->keys = b->sets[b->nsets].data->keys; | ||
253 | ), | ||
254 | |||
255 | TP_printk("bucket %zu", __entry->bucket) | ||
160 | ); | 256 | ); |
161 | 257 | ||
162 | DEFINE_EVENT(bcache_bio, bcache_read_dirty, | 258 | DEFINE_EVENT(btree_node, bcache_btree_node_alloc, |
259 | TP_PROTO(struct btree *b), | ||
260 | TP_ARGS(b) | ||
261 | ); | ||
163 | 262 | ||
164 | TP_PROTO(struct bio *bio), | 263 | DEFINE_EVENT(btree_node, bcache_btree_node_alloc_fail, |
264 | TP_PROTO(struct btree *b), | ||
265 | TP_ARGS(b) | ||
266 | ); | ||
165 | 267 | ||
166 | TP_ARGS(bio) | 268 | DEFINE_EVENT(btree_node, bcache_btree_node_free, |
269 | TP_PROTO(struct btree *b), | ||
270 | TP_ARGS(b) | ||
167 | ); | 271 | ); |
168 | 272 | ||
169 | DEFINE_EVENT(bcache_bio, bcache_write_moving, | 273 | TRACE_EVENT(bcache_btree_gc_coalesce, |
274 | TP_PROTO(unsigned nodes), | ||
275 | TP_ARGS(nodes), | ||
170 | 276 | ||
171 | TP_PROTO(struct bio *bio), | 277 | TP_STRUCT__entry( |
278 | __field(unsigned, nodes ) | ||
279 | ), | ||
172 | 280 | ||
173 | TP_ARGS(bio) | 281 | TP_fast_assign( |
282 | __entry->nodes = nodes; | ||
283 | ), | ||
284 | |||
285 | TP_printk("coalesced %u nodes", __entry->nodes) | ||
174 | ); | 286 | ); |
175 | 287 | ||
176 | DEFINE_EVENT(bcache_bio, bcache_read_moving, | 288 | DEFINE_EVENT(cache_set, bcache_gc_start, |
289 | TP_PROTO(struct cache_set *c), | ||
290 | TP_ARGS(c) | ||
291 | ); | ||
177 | 292 | ||
178 | TP_PROTO(struct bio *bio), | 293 | DEFINE_EVENT(cache_set, bcache_gc_end, |
294 | TP_PROTO(struct cache_set *c), | ||
295 | TP_ARGS(c) | ||
296 | ); | ||
179 | 297 | ||
180 | TP_ARGS(bio) | 298 | DEFINE_EVENT(bkey, bcache_gc_copy, |
299 | TP_PROTO(struct bkey *k), | ||
300 | TP_ARGS(k) | ||
181 | ); | 301 | ); |
182 | 302 | ||
183 | DEFINE_EVENT(bcache_bio, bcache_journal_write, | 303 | DEFINE_EVENT(bkey, bcache_gc_copy_collision, |
304 | TP_PROTO(struct bkey *k), | ||
305 | TP_ARGS(k) | ||
306 | ); | ||
184 | 307 | ||
185 | TP_PROTO(struct bio *bio), | 308 | TRACE_EVENT(bcache_btree_insert_key, |
309 | TP_PROTO(struct btree *b, struct bkey *k, unsigned op, unsigned status), | ||
310 | TP_ARGS(b, k, op, status), | ||
186 | 311 | ||
187 | TP_ARGS(bio) | 312 | TP_STRUCT__entry( |
188 | ); | 313 | __field(u64, btree_node ) |
314 | __field(u32, btree_level ) | ||
315 | __field(u32, inode ) | ||
316 | __field(u64, offset ) | ||
317 | __field(u32, size ) | ||
318 | __field(u8, dirty ) | ||
319 | __field(u8, op ) | ||
320 | __field(u8, status ) | ||
321 | ), | ||
189 | 322 | ||
190 | DECLARE_EVENT_CLASS(bcache_cache_bio, | 323 | TP_fast_assign( |
324 | __entry->btree_node = PTR_BUCKET_NR(b->c, &b->key, 0); | ||
325 | __entry->btree_level = b->level; | ||
326 | __entry->inode = KEY_INODE(k); | ||
327 | __entry->offset = KEY_OFFSET(k); | ||
328 | __entry->size = KEY_SIZE(k); | ||
329 | __entry->dirty = KEY_DIRTY(k); | ||
330 | __entry->op = op; | ||
331 | __entry->status = status; | ||
332 | ), | ||
191 | 333 | ||
192 | TP_PROTO(struct bio *bio, | 334 | TP_printk("%u for %u at %llu(%u): %u:%llu len %u dirty %u", |
193 | sector_t orig_sector, | 335 | __entry->status, __entry->op, |
194 | struct block_device* orig_bdev), | 336 | __entry->btree_node, __entry->btree_level, |
337 | __entry->inode, __entry->offset, | ||
338 | __entry->size, __entry->dirty) | ||
339 | ); | ||
195 | 340 | ||
196 | TP_ARGS(bio, orig_sector, orig_bdev), | 341 | DECLARE_EVENT_CLASS(btree_split, |
342 | TP_PROTO(struct btree *b, unsigned keys), | ||
343 | TP_ARGS(b, keys), | ||
197 | 344 | ||
198 | TP_STRUCT__entry( | 345 | TP_STRUCT__entry( |
199 | __field(dev_t, dev ) | 346 | __field(size_t, bucket ) |
200 | __field(dev_t, orig_dev ) | 347 | __field(unsigned, keys ) |
201 | __field(sector_t, sector ) | ||
202 | __field(sector_t, orig_sector ) | ||
203 | __field(unsigned int, nr_sector ) | ||
204 | __array(char, rwbs, 6 ) | ||
205 | __array(char, comm, TASK_COMM_LEN ) | ||
206 | ), | 348 | ), |
207 | 349 | ||
208 | TP_fast_assign( | 350 | TP_fast_assign( |
209 | __entry->dev = bio->bi_bdev->bd_dev; | 351 | __entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0); |
210 | __entry->orig_dev = orig_bdev->bd_dev; | 352 | __entry->keys = keys; |
211 | __entry->sector = bio->bi_sector; | ||
212 | __entry->orig_sector = orig_sector; | ||
213 | __entry->nr_sector = bio->bi_size >> 9; | ||
214 | blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); | ||
215 | memcpy(__entry->comm, current->comm, TASK_COMM_LEN); | ||
216 | ), | 353 | ), |
217 | 354 | ||
218 | TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d %llu)", | 355 | TP_printk("bucket %zu keys %u", __entry->bucket, __entry->keys) |
219 | MAJOR(__entry->dev), MINOR(__entry->dev), | ||
220 | __entry->rwbs, | ||
221 | (unsigned long long)__entry->sector, | ||
222 | __entry->nr_sector, __entry->comm, | ||
223 | MAJOR(__entry->orig_dev), MINOR(__entry->orig_dev), | ||
224 | (unsigned long long)__entry->orig_sector) | ||
225 | ); | 356 | ); |
226 | 357 | ||
227 | DEFINE_EVENT(bcache_cache_bio, bcache_cache_insert, | 358 | DEFINE_EVENT(btree_split, bcache_btree_node_split, |
228 | 359 | TP_PROTO(struct btree *b, unsigned keys), | |
229 | TP_PROTO(struct bio *bio, | 360 | TP_ARGS(b, keys) |
230 | sector_t orig_sector, | 361 | ); |
231 | struct block_device *orig_bdev), | ||
232 | 362 | ||
233 | TP_ARGS(bio, orig_sector, orig_bdev) | 363 | DEFINE_EVENT(btree_split, bcache_btree_node_compact, |
364 | TP_PROTO(struct btree *b, unsigned keys), | ||
365 | TP_ARGS(b, keys) | ||
234 | ); | 366 | ); |
235 | 367 | ||
236 | DECLARE_EVENT_CLASS(bcache_gc, | 368 | DEFINE_EVENT(btree_node, bcache_btree_set_root, |
369 | TP_PROTO(struct btree *b), | ||
370 | TP_ARGS(b) | ||
371 | ); | ||
237 | 372 | ||
238 | TP_PROTO(uint8_t *uuid), | 373 | /* Allocator */ |
239 | 374 | ||
240 | TP_ARGS(uuid), | 375 | TRACE_EVENT(bcache_alloc_invalidate, |
376 | TP_PROTO(struct cache *ca), | ||
377 | TP_ARGS(ca), | ||
241 | 378 | ||
242 | TP_STRUCT__entry( | 379 | TP_STRUCT__entry( |
243 | __field(uint8_t *, uuid) | 380 | __field(unsigned, free ) |
381 | __field(unsigned, free_inc ) | ||
382 | __field(unsigned, free_inc_size ) | ||
383 | __field(unsigned, unused ) | ||
244 | ), | 384 | ), |
245 | 385 | ||
246 | TP_fast_assign( | 386 | TP_fast_assign( |
247 | __entry->uuid = uuid; | 387 | __entry->free = fifo_used(&ca->free); |
388 | __entry->free_inc = fifo_used(&ca->free_inc); | ||
389 | __entry->free_inc_size = ca->free_inc.size; | ||
390 | __entry->unused = fifo_used(&ca->unused); | ||
248 | ), | 391 | ), |
249 | 392 | ||
250 | TP_printk("%pU", __entry->uuid) | 393 | TP_printk("free %u free_inc %u/%u unused %u", __entry->free, |
394 | __entry->free_inc, __entry->free_inc_size, __entry->unused) | ||
251 | ); | 395 | ); |
252 | 396 | ||
397 | TRACE_EVENT(bcache_alloc_fail, | ||
398 | TP_PROTO(struct cache *ca), | ||
399 | TP_ARGS(ca), | ||
253 | 400 | ||
254 | DEFINE_EVENT(bcache_gc, bcache_gc_start, | 401 | TP_STRUCT__entry( |
402 | __field(unsigned, free ) | ||
403 | __field(unsigned, free_inc ) | ||
404 | __field(unsigned, unused ) | ||
405 | __field(unsigned, blocked ) | ||
406 | ), | ||
255 | 407 | ||
256 | TP_PROTO(uint8_t *uuid), | 408 | TP_fast_assign( |
409 | __entry->free = fifo_used(&ca->free); | ||
410 | __entry->free_inc = fifo_used(&ca->free_inc); | ||
411 | __entry->unused = fifo_used(&ca->unused); | ||
412 | __entry->blocked = atomic_read(&ca->set->prio_blocked); | ||
413 | ), | ||
257 | 414 | ||
258 | TP_ARGS(uuid) | 415 | TP_printk("free %u free_inc %u unused %u blocked %u", __entry->free, |
416 | __entry->free_inc, __entry->unused, __entry->blocked) | ||
259 | ); | 417 | ); |
260 | 418 | ||
261 | DEFINE_EVENT(bcache_gc, bcache_gc_end, | 419 | /* Background writeback */ |
262 | 420 | ||
263 | TP_PROTO(uint8_t *uuid), | 421 | DEFINE_EVENT(bkey, bcache_writeback, |
422 | TP_PROTO(struct bkey *k), | ||
423 | TP_ARGS(k) | ||
424 | ); | ||
264 | 425 | ||
265 | TP_ARGS(uuid) | 426 | DEFINE_EVENT(bkey, bcache_writeback_collision, |
427 | TP_PROTO(struct bkey *k), | ||
428 | TP_ARGS(k) | ||
266 | ); | 429 | ); |
267 | 430 | ||
268 | #endif /* _TRACE_BCACHE_H */ | 431 | #endif /* _TRACE_BCACHE_H */ |
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h index ffd4652de91c..65e12099ef89 100644 --- a/include/xen/interface/io/blkif.h +++ b/include/xen/interface/io/blkif.h | |||
@@ -103,12 +103,46 @@ typedef uint64_t blkif_sector_t; | |||
103 | #define BLKIF_OP_DISCARD 5 | 103 | #define BLKIF_OP_DISCARD 5 |
104 | 104 | ||
105 | /* | 105 | /* |
106 | * Recognized if "feature-max-indirect-segments" is present in the backend | ||
107 | * xenbus info. The "feature-max-indirect-segments" node contains the maximum | ||
108 | * number of segments allowed by the backend per request. If the node is | ||
109 | * present, the frontend might use blkif_request_indirect structs in order to | ||
110 | * issue requests with more than BLKIF_MAX_SEGMENTS_PER_REQUEST (11). The | ||
111 | * maximum number of indirect segments is fixed by the backend, but the | ||
112 | * frontend can issue requests with any number of indirect segments as long as | ||
113 | * it's less than the number provided by the backend. The indirect_grefs field | ||
114 | * in blkif_request_indirect should be filled by the frontend with the | ||
115 | * grant references of the pages that are holding the indirect segments. | ||
116 | * These pages are filled with an array of blkif_request_segment_aligned | ||
117 | * that hold the information about the segments. The number of indirect | ||
118 | * pages to use is determined by the maximum number of segments | ||
119 | * an indirect request contains. Every indirect page can contain a maximum | ||
120 | * of 512 segments (PAGE_SIZE/sizeof(blkif_request_segment_aligned)), | ||
121 | * so to calculate the number of indirect pages to use we have to do | ||
122 | * ceil(indirect_segments/512). | ||
123 | * | ||
124 | * If a backend does not recognize BLKIF_OP_INDIRECT, it should *not* | ||
125 | * create the "feature-max-indirect-segments" node! | ||
126 | */ | ||
127 | #define BLKIF_OP_INDIRECT 6 | ||
128 | |||
129 | /* | ||
106 | * Maximum scatter/gather segments per request. | 130 | * Maximum scatter/gather segments per request. |
107 | * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE. | 131 | * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE. |
108 | * NB. This could be 12 if the ring indexes weren't stored in the same page. | 132 | * NB. This could be 12 if the ring indexes weren't stored in the same page. |
109 | */ | 133 | */ |
110 | #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 | 134 | #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 |
111 | 135 | ||
136 | #define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8 | ||
137 | |||
138 | struct blkif_request_segment_aligned { | ||
139 | grant_ref_t gref; /* reference to I/O buffer frame */ | ||
140 | /* @first_sect: first sector in frame to transfer (inclusive). */ | ||
141 | /* @last_sect: last sector in frame to transfer (inclusive). */ | ||
142 | uint8_t first_sect, last_sect; | ||
143 | uint16_t _pad; /* padding to make it 8 bytes, so it's cache-aligned */ | ||
144 | } __attribute__((__packed__)); | ||
145 | |||
112 | struct blkif_request_rw { | 146 | struct blkif_request_rw { |
113 | uint8_t nr_segments; /* number of segments */ | 147 | uint8_t nr_segments; /* number of segments */ |
114 | blkif_vdev_t handle; /* only for read/write requests */ | 148 | blkif_vdev_t handle; /* only for read/write requests */ |
@@ -147,12 +181,31 @@ struct blkif_request_other { | |||
147 | uint64_t id; /* private guest value, echoed in resp */ | 181 | uint64_t id; /* private guest value, echoed in resp */ |
148 | } __attribute__((__packed__)); | 182 | } __attribute__((__packed__)); |
149 | 183 | ||
184 | struct blkif_request_indirect { | ||
185 | uint8_t indirect_op; | ||
186 | uint16_t nr_segments; | ||
187 | #ifdef CONFIG_X86_64 | ||
188 | uint32_t _pad1; /* offsetof(blkif_...,u.indirect.id) == 8 */ | ||
189 | #endif | ||
190 | uint64_t id; | ||
191 | blkif_sector_t sector_number; | ||
192 | blkif_vdev_t handle; | ||
193 | uint16_t _pad2; | ||
194 | grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST]; | ||
195 | #ifdef CONFIG_X86_64 | ||
196 | uint32_t _pad3; /* make it 64 byte aligned */ | ||
197 | #else | ||
198 | uint64_t _pad3; /* make it 64 byte aligned */ | ||
199 | #endif | ||
200 | } __attribute__((__packed__)); | ||
201 | |||
150 | struct blkif_request { | 202 | struct blkif_request { |
151 | uint8_t operation; /* BLKIF_OP_??? */ | 203 | uint8_t operation; /* BLKIF_OP_??? */ |
152 | union { | 204 | union { |
153 | struct blkif_request_rw rw; | 205 | struct blkif_request_rw rw; |
154 | struct blkif_request_discard discard; | 206 | struct blkif_request_discard discard; |
155 | struct blkif_request_other other; | 207 | struct blkif_request_other other; |
208 | struct blkif_request_indirect indirect; | ||
156 | } u; | 209 | } u; |
157 | } __attribute__((__packed__)); | 210 | } __attribute__((__packed__)); |
158 | 211 | ||
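The comment above fixes the per-page segment count at PAGE_SIZE / sizeof(struct blkif_request_segment_aligned), i.e. 4096 / 8 = 512 on 4 KiB pages, so the number of indirect pages a request needs is ceil(nr_segments / 512). Below is a rough sketch of how a frontend might size and populate an indirect request under those rules; the helper grant_ref_for_indirect_page() is a placeholder, not a real API, and the error handling is illustrative.

/* Sketch only: sizing and filling a BLKIF_OP_INDIRECT request.
 * SEGS_PER_INDIRECT_FRAME = PAGE_SIZE / sizeof(struct blkif_request_segment_aligned)
 *                         = 4096 / 8 = 512 on 4 KiB pages.
 */
#include <linux/kernel.h>
#include <xen/interface/io/blkif.h>

#define SEGS_PER_INDIRECT_FRAME \
	(PAGE_SIZE / sizeof(struct blkif_request_segment_aligned))

static int example_fill_indirect(struct blkif_request *req,
				 unsigned int nr_segments)
{
	unsigned int nr_pages = DIV_ROUND_UP(nr_segments, SEGS_PER_INDIRECT_FRAME);
	unsigned int i;

	if (nr_pages > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST)
		return -EINVAL;

	req->operation = BLKIF_OP_INDIRECT;
	req->u.indirect.indirect_op = BLKIF_OP_WRITE;	/* the real I/O operation */
	req->u.indirect.nr_segments = nr_segments;

	for (i = 0; i < nr_pages; i++)
		/* grant_ref_for_indirect_page() is a placeholder for however the
		 * frontend grants its indirect pages to the backend. */
		req->u.indirect.indirect_grefs[i] = grant_ref_for_indirect_page(i);

	return 0;
}

The backend then maps each granted indirect page and walks the blkif_request_segment_aligned arrays it contains to recover the individual segments.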
diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h index 75271b9a8f61..7d28aff605c7 100644 --- a/include/xen/interface/io/ring.h +++ b/include/xen/interface/io/ring.h | |||
@@ -188,6 +188,11 @@ struct __name##_back_ring { \ | |||
188 | #define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \ | 188 | #define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \ |
189 | (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r)) | 189 | (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r)) |
190 | 190 | ||
191 | /* Ill-behaved frontend determination: Can there be this many requests? */ | ||
192 | #define RING_REQUEST_PROD_OVERFLOW(_r, _prod) \ | ||
193 | (((_prod) - (_r)->rsp_prod_pvt) > RING_SIZE(_r)) | ||
194 | |||
195 | |||
191 | #define RING_PUSH_REQUESTS(_r) do { \ | 196 | #define RING_PUSH_REQUESTS(_r) do { \ |
192 | wmb(); /* back sees requests /before/ updated producer index */ \ | 197 | wmb(); /* back sees requests /before/ updated producer index */ \ |
193 | (_r)->sring->req_prod = (_r)->req_prod_pvt; \ | 198 | (_r)->sring->req_prod = (_r)->req_prod_pvt; \ |
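RING_REQUEST_PROD_OVERFLOW, added above, lets a backend check the shared producer index it just read against the ring size, so a frontend that advertises more outstanding requests than the ring can hold is caught before any of them are consumed. A hedged sketch of the intended usage pattern follows; the function wrapper and the warning text are illustrative, while struct blkif_back_ring comes from the DEFINE_RING_TYPES() instantiation in blkif.h.

/* Sketch: backend-side sanity check using RING_REQUEST_PROD_OVERFLOW. */
#include <xen/interface/io/blkif.h>
#include <xen/interface/io/ring.h>

static bool example_frontend_overflowed(struct blkif_back_ring *ring)
{
	RING_IDX rp = ring->sring->req_prod;

	rmb();	/* read the producer index before looking at any request */

	if (RING_REQUEST_PROD_OVERFLOW(ring, rp)) {
		pr_warn("frontend advertised %u requests, ring size is %u\n",
			rp - ring->rsp_prod_pvt, RING_SIZE(ring));
		return true;	/* caller should disconnect the frontend */
	}
	return false;
}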