Diffstat (limited to 'drivers/md')
51 files changed, 8658 insertions, 747 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 91a02eeeb319..4d8d90b4fe78 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -154,17 +154,6 @@ config MD_RAID456
 
 	  If unsure, say Y.
 
-config MULTICORE_RAID456
-	bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)"
-	depends on MD_RAID456
-	depends on SMP
-	depends on EXPERIMENTAL
-	---help---
-	  Enable the raid456 module to dispatch per-stripe raid operations to a
-	  thread pool.
-
-	  If unsure, say N.
-
 config MD_MULTIPATH
 	tristate "Multipath I/O support"
 	depends on BLK_DEV_MD
@@ -210,7 +199,7 @@ config DM_DEBUG
 
 config DM_BUFIO
 	tristate
-	depends on BLK_DEV_DM && EXPERIMENTAL
+	depends on BLK_DEV_DM
 	---help---
 	 This interface allows you to do buffered I/O on a device and acts
 	 as a cache, holding recently-read blocks in memory and performing
@@ -218,7 +207,7 @@ config DM_BUFIO
 
 config DM_BIO_PRISON
 	tristate
-	depends on BLK_DEV_DM && EXPERIMENTAL
+	depends on BLK_DEV_DM
 	---help---
 	 Some bio locking schemes used by other device-mapper targets
 	 including thin provisioning.
@@ -251,8 +240,8 @@ config DM_SNAPSHOT
 	  Allow volume managers to take writable snapshots of a device.
 
 config DM_THIN_PROVISIONING
-	tristate "Thin provisioning target (EXPERIMENTAL)"
-	depends on BLK_DEV_DM && EXPERIMENTAL
+	tristate "Thin provisioning target"
+	depends on BLK_DEV_DM
 	select DM_PERSISTENT_DATA
 	select DM_BIO_PRISON
 	---help---
@@ -268,6 +257,37 @@ config DM_DEBUG_BLOCK_STACK_TRACING
 
 	  If unsure, say N.
 
+config DM_CACHE
+	tristate "Cache target (EXPERIMENTAL)"
+	depends on BLK_DEV_DM
+	default n
+	select DM_PERSISTENT_DATA
+	select DM_BIO_PRISON
+	---help---
+	  dm-cache attempts to improve performance of a block device by
+	  moving frequently used data to a smaller, higher performance
+	  device.  Different 'policy' plugins can be used to change the
+	  algorithms used to select which blocks are promoted, demoted,
+	  cleaned etc.  It supports writeback and writethrough modes.
+
+config DM_CACHE_MQ
+	tristate "MQ Cache Policy (EXPERIMENTAL)"
+	depends on DM_CACHE
+	default y
+	---help---
+	  A cache policy that uses a multiqueue ordered by recent hit
+	  count to select which blocks should be promoted and demoted.
+	  This is meant to be a general purpose policy.  It prioritises
+	  reads over writes.
+
+config DM_CACHE_CLEANER
+	tristate "Cleaner Cache Policy (EXPERIMENTAL)"
+	depends on DM_CACHE
+	default y
+	---help---
+	  A simple cache policy that writes back all data to the
+	  origin.  Used when decommissioning a dm-cache.
+
 config DM_MIRROR
 	tristate "Mirror target"
 	depends on BLK_DEV_DM
@@ -302,8 +322,8 @@ config DM_RAID
 	 in one of the available parity distribution methods.
 
 config DM_LOG_USERSPACE
-	tristate "Mirror userspace logging (EXPERIMENTAL)"
-	depends on DM_MIRROR && EXPERIMENTAL && NET
+	tristate "Mirror userspace logging"
+	depends on DM_MIRROR && NET
 	select CONNECTOR
 	---help---
 	  The userspace logging module provides a mechanism for
@@ -350,8 +370,8 @@ config DM_MULTIPATH_ST
 	  If unsure, say N.
 
 config DM_DELAY
-	tristate "I/O delaying target (EXPERIMENTAL)"
-	depends on BLK_DEV_DM && EXPERIMENTAL
+	tristate "I/O delaying target"
+	depends on BLK_DEV_DM
 	---help---
 	A target that delays reads and/or writes and can send
 	them to different devices.  Useful for testing.
@@ -365,14 +385,14 @@ config DM_UEVENT
 	  Generate udev events for DM events.
 
 config DM_FLAKEY
-	tristate "Flakey target (EXPERIMENTAL)"
-	depends on BLK_DEV_DM && EXPERIMENTAL
+	tristate "Flakey target"
+	depends on BLK_DEV_DM
 	---help---
 	  A target that intermittently fails I/O for debugging purposes.
 
 config DM_VERITY
-	tristate "Verity target support (EXPERIMENTAL)"
-	depends on BLK_DEV_DM && EXPERIMENTAL
+	tristate "Verity target support"
+	depends on BLK_DEV_DM
 	select CRYPTO
 	select CRYPTO_HASH
 	select DM_BUFIO
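
The DM_CACHE help text above is the core idea of the whole series: a policy plugin decides which origin blocks get promoted into the fast device and which get demoted, with mq weighting reads above writes. As a conceptual sketch only (not code from this patch, and not the real mq implementation), the kind of decision a policy plugin makes looks roughly like this:

	/*
	 * Conceptual sketch only -- not from this patch and not the real
	 * mq policy.  It illustrates the decision a dm-cache policy makes:
	 * promote an origin block once its recent hit count crosses a
	 * threshold, weighting reads above writes as the mq help text says.
	 */
	struct example_stats {
		unsigned hit_count;	/* recent accesses to this origin block */
	};

	static bool example_should_promote(const struct example_stats *s,
					   unsigned promote_threshold,
					   bool is_read)
	{
		unsigned weighted = is_read ? 2 * s->hit_count : s->hit_count;

		return weighted >= promote_threshold;
	}
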
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 94dce8b49324..7ceeaefc0e95 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -11,6 +11,9 @@ dm-mirror-y	+= dm-raid1.o
 dm-log-userspace-y \
 		+= dm-log-userspace-base.o dm-log-userspace-transfer.o
 dm-thin-pool-y	+= dm-thin.o dm-thin-metadata.o
+dm-cache-y	+= dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
+dm-cache-mq-y	+= dm-cache-policy-mq.o
+dm-cache-cleaner-y += dm-cache-policy-cleaner.o
 md-mod-y	+= md.o bitmap.o
 raid456-y	+= raid5.o
 
@@ -44,6 +47,9 @@ obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
 obj-$(CONFIG_DM_RAID)	+= dm-raid.o
 obj-$(CONFIG_DM_THIN_PROVISIONING)	+= dm-thin-pool.o
 obj-$(CONFIG_DM_VERITY)		+= dm-verity.o
+obj-$(CONFIG_DM_CACHE)		+= dm-cache.o
+obj-$(CONFIG_DM_CACHE_MQ)	+= dm-cache-mq.o
+obj-$(CONFIG_DM_CACHE_CLEANER)	+= dm-cache-cleaner.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 7155945f8eb8..4fd9d6aeff6a 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -337,7 +337,7 @@ static int read_page(struct file *file, unsigned long index,
 		     struct page *page)
 {
 	int ret = 0;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct buffer_head *bh;
 	sector_t block;
 
@@ -755,7 +755,7 @@ static void bitmap_file_unmap(struct bitmap_storage *store)
 	free_buffers(sb_page);
 
 	if (file) {
-		struct inode *inode = file->f_path.dentry->d_inode;
+		struct inode *inode = file_inode(file);
 		invalidate_mapping_pages(inode->i_mapping, 0, -1);
 		fput(file);
 	}
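
Both hunks above swap the open-coded f_path.dentry->d_inode chase for the file_inode() helper introduced in this kernel cycle. For reference, and hedged since this mirrors its definition in <linux/fs.h> of this era rather than anything in this diff, the helper simply returns the inode cached on the file:

	/*
	 * Roughly what file_inode() does (assumption: matches the 3.9-era
	 * <linux/fs.h> definition, where struct file caches its inode in
	 * f_inode).  The example_ name is ours, not the kernel's.
	 */
	static inline struct inode *example_file_inode(const struct file *f)
	{
		return f->f_inode;
	}
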
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
index aefb78e3cbf9..85f0b7074257 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison.c
@@ -14,14 +14,6 @@
 
 /*----------------------------------------------------------------*/
 
-struct dm_bio_prison_cell {
-	struct hlist_node list;
-	struct dm_bio_prison *prison;
-	struct dm_cell_key key;
-	struct bio *holder;
-	struct bio_list bios;
-};
-
 struct dm_bio_prison {
 	spinlock_t lock;
 	mempool_t *cell_pool;
@@ -87,6 +79,19 @@ void dm_bio_prison_destroy(struct dm_bio_prison *prison)
 }
 EXPORT_SYMBOL_GPL(dm_bio_prison_destroy);
 
+struct dm_bio_prison_cell *dm_bio_prison_alloc_cell(struct dm_bio_prison *prison, gfp_t gfp)
+{
+	return mempool_alloc(prison->cell_pool, gfp);
+}
+EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell);
+
+void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
+			     struct dm_bio_prison_cell *cell)
+{
+	mempool_free(cell, prison->cell_pool);
+}
+EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell);
+
 static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key)
 {
 	const unsigned long BIG_PRIME = 4294967291UL;
@@ -106,100 +111,103 @@ static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket,
 						  struct dm_cell_key *key)
 {
 	struct dm_bio_prison_cell *cell;
-	struct hlist_node *tmp;
 
-	hlist_for_each_entry(cell, tmp, bucket, list)
+	hlist_for_each_entry(cell, bucket, list)
 		if (keys_equal(&cell->key, key))
 			return cell;
 
 	return NULL;
 }
 
-/*
- * This may block if a new cell needs allocating.  You must ensure that
- * cells will be unlocked even if the calling thread is blocked.
- *
- * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
- */
-int dm_bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key,
-		  struct bio *inmate, struct dm_bio_prison_cell **ref)
+static void __setup_new_cell(struct dm_bio_prison *prison,
+			     struct dm_cell_key *key,
+			     struct bio *holder,
+			     uint32_t hash,
+			     struct dm_bio_prison_cell *cell)
 {
-	int r = 1;
-	unsigned long flags;
-	uint32_t hash = hash_key(prison, key);
-	struct dm_bio_prison_cell *cell, *cell2;
-
-	BUG_ON(hash > prison->nr_buckets);
-
-	spin_lock_irqsave(&prison->lock, flags);
-
-	cell = __search_bucket(prison->cells + hash, key);
-	if (cell) {
-		bio_list_add(&cell->bios, inmate);
-		goto out;
-	}
+	memcpy(&cell->key, key, sizeof(cell->key));
+	cell->holder = holder;
+	bio_list_init(&cell->bios);
+	hlist_add_head(&cell->list, prison->cells + hash);
+}
 
-	/*
-	 * Allocate a new cell
-	 */
-	spin_unlock_irqrestore(&prison->lock, flags);
-	cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
-	spin_lock_irqsave(&prison->lock, flags);
+static int __bio_detain(struct dm_bio_prison *prison,
+			struct dm_cell_key *key,
+			struct bio *inmate,
+			struct dm_bio_prison_cell *cell_prealloc,
+			struct dm_bio_prison_cell **cell_result)
+{
+	uint32_t hash = hash_key(prison, key);
+	struct dm_bio_prison_cell *cell;
 
-	/*
-	 * We've been unlocked, so we have to double check that
-	 * nobody else has inserted this cell in the meantime.
-	 */
 	cell = __search_bucket(prison->cells + hash, key);
 	if (cell) {
-		mempool_free(cell2, prison->cell_pool);
-		bio_list_add(&cell->bios, inmate);
-		goto out;
+		if (inmate)
+			bio_list_add(&cell->bios, inmate);
+		*cell_result = cell;
+		return 1;
 	}
 
-	/*
-	 * Use new cell.
-	 */
-	cell = cell2;
-
-	cell->prison = prison;
-	memcpy(&cell->key, key, sizeof(cell->key));
-	cell->holder = inmate;
-	bio_list_init(&cell->bios);
-	hlist_add_head(&cell->list, prison->cells + hash);
+	__setup_new_cell(prison, key, inmate, hash, cell_prealloc);
+	*cell_result = cell_prealloc;
+	return 0;
+}
 
-	r = 0;
+static int bio_detain(struct dm_bio_prison *prison,
+		      struct dm_cell_key *key,
+		      struct bio *inmate,
+		      struct dm_bio_prison_cell *cell_prealloc,
+		      struct dm_bio_prison_cell **cell_result)
+{
+	int r;
+	unsigned long flags;
 
-out:
+	spin_lock_irqsave(&prison->lock, flags);
+	r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result);
 	spin_unlock_irqrestore(&prison->lock, flags);
 
-	*ref = cell;
-
 	return r;
 }
+
+int dm_bio_detain(struct dm_bio_prison *prison,
+		  struct dm_cell_key *key,
+		  struct bio *inmate,
+		  struct dm_bio_prison_cell *cell_prealloc,
+		  struct dm_bio_prison_cell **cell_result)
+{
+	return bio_detain(prison, key, inmate, cell_prealloc, cell_result);
+}
 EXPORT_SYMBOL_GPL(dm_bio_detain);
 
+int dm_get_cell(struct dm_bio_prison *prison,
+		struct dm_cell_key *key,
+		struct dm_bio_prison_cell *cell_prealloc,
+		struct dm_bio_prison_cell **cell_result)
+{
+	return bio_detain(prison, key, NULL, cell_prealloc, cell_result);
+}
+EXPORT_SYMBOL_GPL(dm_get_cell);
+
 /*
  * @inmates must have been initialised prior to this call
  */
-static void __cell_release(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
+static void __cell_release(struct dm_bio_prison_cell *cell,
+			   struct bio_list *inmates)
 {
-	struct dm_bio_prison *prison = cell->prison;
-
 	hlist_del(&cell->list);
 
 	if (inmates) {
-		bio_list_add(inmates, cell->holder);
+		if (cell->holder)
+			bio_list_add(inmates, cell->holder);
 		bio_list_merge(inmates, &cell->bios);
 	}
-
-	mempool_free(cell, prison->cell_pool);
 }
 
-void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios)
+void dm_cell_release(struct dm_bio_prison *prison,
+		     struct dm_bio_prison_cell *cell,
+		     struct bio_list *bios)
 {
 	unsigned long flags;
-	struct dm_bio_prison *prison = cell->prison;
 
 	spin_lock_irqsave(&prison->lock, flags);
 	__cell_release(cell, bios);
@@ -210,20 +218,18 @@ EXPORT_SYMBOL_GPL(dm_cell_release);
 /*
  * Sometimes we don't want the holder, just the additional bios.
  */
-static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
+static void __cell_release_no_holder(struct dm_bio_prison_cell *cell,
+				     struct bio_list *inmates)
 {
-	struct dm_bio_prison *prison = cell->prison;
-
 	hlist_del(&cell->list);
 	bio_list_merge(inmates, &cell->bios);
-
-	mempool_free(cell, prison->cell_pool);
 }
 
-void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates)
+void dm_cell_release_no_holder(struct dm_bio_prison *prison,
+			       struct dm_bio_prison_cell *cell,
+			       struct bio_list *inmates)
 {
 	unsigned long flags;
-	struct dm_bio_prison *prison = cell->prison;
 
 	spin_lock_irqsave(&prison->lock, flags);
 	__cell_release_no_holder(cell, inmates);
@@ -231,9 +237,9 @@ void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list
 }
 EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
 
-void dm_cell_error(struct dm_bio_prison_cell *cell)
+void dm_cell_error(struct dm_bio_prison *prison,
+		   struct dm_bio_prison_cell *cell)
 {
-	struct dm_bio_prison *prison = cell->prison;
 	struct bio_list bios;
 	struct bio *bio;
 	unsigned long flags;
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
index 53d1a7a84e2f..3f833190eadf 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison.h
@@ -22,7 +22,6 @@
  * subsequently unlocked the bios become available.
  */
 struct dm_bio_prison;
-struct dm_bio_prison_cell;
 
 /* FIXME: this needs to be more abstract */
 struct dm_cell_key {
@@ -31,21 +30,62 @@ struct dm_cell_key {
 	dm_block_t block;
 };
 
+/*
+ * Treat this as opaque, only in header so callers can manage allocation
+ * themselves.
+ */
+struct dm_bio_prison_cell {
+	struct hlist_node list;
+	struct dm_cell_key key;
+	struct bio *holder;
+	struct bio_list bios;
+};
+
 struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells);
 void dm_bio_prison_destroy(struct dm_bio_prison *prison);
 
 /*
- * This may block if a new cell needs allocating.  You must ensure that
- * cells will be unlocked even if the calling thread is blocked.
+ * These two functions just wrap a mempool.  This is a transitory step:
+ * Eventually all bio prison clients should manage their own cell memory.
  *
- * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
+ * Like mempool_alloc(), dm_bio_prison_alloc_cell() can only fail if called
+ * in interrupt context or passed GFP_NOWAIT.
  */
-int dm_bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key,
-		  struct bio *inmate, struct dm_bio_prison_cell **ref);
+struct dm_bio_prison_cell *dm_bio_prison_alloc_cell(struct dm_bio_prison *prison,
+						    gfp_t gfp);
+void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
+			     struct dm_bio_prison_cell *cell);
 
-void dm_cell_release(struct dm_bio_prison_cell *cell, struct bio_list *bios);
-void dm_cell_release_no_holder(struct dm_bio_prison_cell *cell, struct bio_list *inmates);
-void dm_cell_error(struct dm_bio_prison_cell *cell);
+/*
+ * Creates, or retrieves a cell for the given key.
+ *
+ * Returns 1 if pre-existing cell returned, zero if new cell created using
+ * @cell_prealloc.
+ */
+int dm_get_cell(struct dm_bio_prison *prison,
+		struct dm_cell_key *key,
+		struct dm_bio_prison_cell *cell_prealloc,
+		struct dm_bio_prison_cell **cell_result);
+
+/*
+ * An atomic op that combines retrieving a cell, and adding a bio to it.
+ *
+ * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
+ */
+int dm_bio_detain(struct dm_bio_prison *prison,
+		  struct dm_cell_key *key,
+		  struct bio *inmate,
+		  struct dm_bio_prison_cell *cell_prealloc,
+		  struct dm_bio_prison_cell **cell_result);
+
+void dm_cell_release(struct dm_bio_prison *prison,
+		     struct dm_bio_prison_cell *cell,
+		     struct bio_list *bios);
+void dm_cell_release_no_holder(struct dm_bio_prison *prison,
+			       struct dm_bio_prison_cell *cell,
+			       struct bio_list *inmates);
+void dm_cell_error(struct dm_bio_prison *prison,
+		   struct dm_bio_prison_cell *cell);
 
 /*----------------------------------------------------------------*/
 
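
The header comments above spell out the reworked contract: callers now preallocate cell memory and pass it in, so detaining never allocates under the prison lock, and dm_get_cell() can lock a region with no bio attached. A minimal sketch of the intended calling pattern (illustrative only, not part of the patch; the example_ name is ours):

	/*
	 * Illustrative only -- not part of the patch.  Preallocate a cell
	 * while it is still safe to sleep, then detain; if the key was
	 * already held (r == 1) the preallocated cell was not consumed
	 * and must be handed back.
	 */
	static int example_detain(struct dm_bio_prison *prison,
				  struct dm_cell_key *key, struct bio *bio)
	{
		struct dm_bio_prison_cell *prealloc, *cell;
		int r;

		prealloc = dm_bio_prison_alloc_cell(prison, GFP_NOIO);

		r = dm_bio_detain(prison, key, bio, prealloc, &cell);
		if (r)		/* bio was queued on the existing cell */
			dm_bio_prison_free_cell(prison, prealloc);

		/* later: dm_cell_release(prison, cell, &some_bio_list); */
		return r;
	}
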
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 651ca79881dd..3c955e10a618 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -859,9 +859,8 @@ static void __check_watermark(struct dm_bufio_client *c)
 static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
 {
 	struct dm_buffer *b;
-	struct hlist_node *hn;
 
-	hlist_for_each_entry(b, hn, &c->cache_hash[DM_BUFIO_HASH(block)],
+	hlist_for_each_entry(b, &c->cache_hash[DM_BUFIO_HASH(block)],
 			     hash_list) {
 		dm_bufio_cond_resched();
 		if (b->block == block)
@@ -1193,7 +1192,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
 int dm_bufio_issue_flush(struct dm_bufio_client *c)
 {
 	struct dm_io_request io_req = {
-		.bi_rw = REQ_FLUSH,
+		.bi_rw = WRITE_FLUSH,
 		.mem.type = DM_IO_KMEM,
 		.mem.ptr.addr = NULL,
 		.client = c->dm_io,
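
The first hunk here (and the matching one in dm-bio-prison.c above) follows the tree-wide change in this cycle that dropped the node parameter from hlist_for_each_entry(). The second hunk is a correctness fix: REQ_FLUSH on its own carries no write-direction bit, so the empty flush bio wasn't being treated as a write. WRITE_FLUSH bundles the direction with the flush flag; hedged reference, since this mirrors the <linux/fs.h> definition of this era rather than anything shown in this diff:

	/* From 3.x-era <linux/fs.h> (assumption, quoted for reference): */
	#define WRITE_FLUSH	(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH)
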
diff --git a/drivers/md/dm-cache-block-types.h b/drivers/md/dm-cache-block-types.h
new file mode 100644
index 000000000000..bed4ad4e1b7c
--- /dev/null
+++ b/drivers/md/dm-cache-block-types.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_CACHE_BLOCK_TYPES_H
+#define DM_CACHE_BLOCK_TYPES_H
+
+#include "persistent-data/dm-block-manager.h"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * It's helpful to get sparse to differentiate between indexes into the
+ * origin device, indexes into the cache device, and indexes into the
+ * discard bitset.
+ */
+
+typedef dm_block_t __bitwise__ dm_oblock_t;
+typedef uint32_t __bitwise__ dm_cblock_t;
+typedef dm_block_t __bitwise__ dm_dblock_t;
+
+static inline dm_oblock_t to_oblock(dm_block_t b)
+{
+	return (__force dm_oblock_t) b;
+}
+
+static inline dm_block_t from_oblock(dm_oblock_t b)
+{
+	return (__force dm_block_t) b;
+}
+
+static inline dm_cblock_t to_cblock(uint32_t b)
+{
+	return (__force dm_cblock_t) b;
+}
+
+static inline uint32_t from_cblock(dm_cblock_t b)
+{
+	return (__force uint32_t) b;
+}
+
+static inline dm_dblock_t to_dblock(dm_block_t b)
+{
+	return (__force dm_dblock_t) b;
+}
+
+static inline dm_block_t from_dblock(dm_dblock_t b)
+{
+	return (__force dm_block_t) b;
+}
+
+#endif /* DM_CACHE_BLOCK_TYPES_H */
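
As the header comment says, the __bitwise__ typedefs exist so a sparse build can catch accidental mixing of the three block-index spaces, the same trick used for __le32 and friends. A small illustration (not from the patch) of what that buys under a `make C=1` sparse run:

	/*
	 * Illustration only -- not from the patch.  Under sparse the three
	 * index spaces are distinct types, even though they compile down
	 * to plain integers with gcc.
	 */
	static void example_indexes(void)
	{
		dm_oblock_t ob = to_oblock(123);	/* origin index  */
		dm_cblock_t cb = to_cblock(7);		/* cache index   */
		dm_block_t raw;

		raw = from_oblock(ob);	/* fine: explicit (__force) conversion */
		(void) raw;
		(void) cb;

		/* ob = (dm_oblock_t) cb;  -- sparse would warn on this mix-up */
	}
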
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
new file mode 100644
index 000000000000..fbd3625f2748
--- /dev/null
+++ b/drivers/md/dm-cache-metadata.c
@@ -0,0 +1,1146 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-metadata.h"
+
+#include "persistent-data/dm-array.h"
+#include "persistent-data/dm-bitset.h"
+#include "persistent-data/dm-space-map.h"
+#include "persistent-data/dm-space-map-disk.h"
+#include "persistent-data/dm-transaction-manager.h"
+
+#include <linux/device-mapper.h>
+
+/*----------------------------------------------------------------*/
+
+#define DM_MSG_PREFIX   "cache metadata"
+
+#define CACHE_SUPERBLOCK_MAGIC 06142003
+#define CACHE_SUPERBLOCK_LOCATION 0
+#define CACHE_VERSION 1
+#define CACHE_METADATA_CACHE_SIZE 64
+
+/*
+ *  3 for btree insert +
+ *  2 for btree lookup used within space map
+ */
+#define CACHE_MAX_CONCURRENT_LOCKS 5
+#define SPACE_MAP_ROOT_SIZE 128
+
+enum superblock_flag_bits {
+	/* for spotting crashes that would invalidate the dirty bitset */
+	CLEAN_SHUTDOWN,
+};
+
+/*
+ * Each mapping from cache block -> origin block carries a set of flags.
+ */
+enum mapping_bits {
+	/*
+	 * A valid mapping.  Because we're using an array we clear this
+	 * flag for an non existant mapping.
+	 */
+	M_VALID = 1,
+
+	/*
+	 * The data on the cache is different from that on the origin.
+	 */
+	M_DIRTY = 2
+};
+
+struct cache_disk_superblock {
+	__le32 csum;
+	__le32 flags;
+	__le64 blocknr;
+
+	__u8 uuid[16];
+	__le64 magic;
+	__le32 version;
+
+	__u8 policy_name[CACHE_POLICY_NAME_SIZE];
+	__le32 policy_hint_size;
+
+	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
+	__le64 mapping_root;
+	__le64 hint_root;
+
+	__le64 discard_root;
+	__le64 discard_block_size;
+	__le64 discard_nr_blocks;
+
+	__le32 data_block_size;
+	__le32 metadata_block_size;
+	__le32 cache_blocks;
+
+	__le32 compat_flags;
+	__le32 compat_ro_flags;
+	__le32 incompat_flags;
+
+	__le32 read_hits;
+	__le32 read_misses;
+	__le32 write_hits;
+	__le32 write_misses;
+} __packed;
+
+struct dm_cache_metadata {
+	struct block_device *bdev;
+	struct dm_block_manager *bm;
+	struct dm_space_map *metadata_sm;
+	struct dm_transaction_manager *tm;
+
+	struct dm_array_info info;
+	struct dm_array_info hint_info;
+	struct dm_disk_bitset discard_info;
+
+	struct rw_semaphore root_lock;
+	dm_block_t root;
+	dm_block_t hint_root;
+	dm_block_t discard_root;
+
+	sector_t discard_block_size;
+	dm_dblock_t discard_nr_blocks;
+
+	sector_t data_block_size;
+	dm_cblock_t cache_blocks;
+	bool changed:1;
+	bool clean_when_opened:1;
+
+	char policy_name[CACHE_POLICY_NAME_SIZE];
+	size_t policy_hint_size;
+	struct dm_cache_statistics stats;
+};
+
+/*-------------------------------------------------------------------
+ * superblock validator
+ *-----------------------------------------------------------------*/
+
+#define SUPERBLOCK_CSUM_XOR 9031977
+
+static void sb_prepare_for_write(struct dm_block_validator *v,
+				 struct dm_block *b,
+				 size_t sb_block_size)
+{
+	struct cache_disk_superblock *disk_super = dm_block_data(b);
+
+	disk_super->blocknr = cpu_to_le64(dm_block_location(b));
+	disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
+						      sb_block_size - sizeof(__le32),
+						      SUPERBLOCK_CSUM_XOR));
+}
+
+static int sb_check(struct dm_block_validator *v,
+		    struct dm_block *b,
+		    size_t sb_block_size)
+{
+	struct cache_disk_superblock *disk_super = dm_block_data(b);
+	__le32 csum_le;
+
+	if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
+		DMERR("sb_check failed: blocknr %llu: wanted %llu",
+		      le64_to_cpu(disk_super->blocknr),
+		      (unsigned long long)dm_block_location(b));
+		return -ENOTBLK;
+	}
+
+	if (le64_to_cpu(disk_super->magic) != CACHE_SUPERBLOCK_MAGIC) {
+		DMERR("sb_check failed: magic %llu: wanted %llu",
+		      le64_to_cpu(disk_super->magic),
+		      (unsigned long long)CACHE_SUPERBLOCK_MAGIC);
+		return -EILSEQ;
+	}
+
+	csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
+					     sb_block_size - sizeof(__le32),
+					     SUPERBLOCK_CSUM_XOR));
+	if (csum_le != disk_super->csum) {
+		DMERR("sb_check failed: csum %u: wanted %u",
+		      le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
+		return -EILSEQ;
+	}
+
+	return 0;
+}
+
+static struct dm_block_validator sb_validator = {
+	.name = "superblock",
+	.prepare_for_write = sb_prepare_for_write,
+	.check = sb_check
+};
+
+/*----------------------------------------------------------------*/
+
+static int superblock_read_lock(struct dm_cache_metadata *cmd,
+				struct dm_block **sblock)
+{
+	return dm_bm_read_lock(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
+			       &sb_validator, sblock);
+}
+
+static int superblock_lock_zero(struct dm_cache_metadata *cmd,
+				struct dm_block **sblock)
+{
+	return dm_bm_write_lock_zero(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
+				     &sb_validator, sblock);
+}
+
+static int superblock_lock(struct dm_cache_metadata *cmd,
+			   struct dm_block **sblock)
+{
+	return dm_bm_write_lock(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
+				&sb_validator, sblock);
+}
+
+/*----------------------------------------------------------------*/
+
+static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
+{
+	int r;
+	unsigned i;
+	struct dm_block *b;
+	__le64 *data_le, zero = cpu_to_le64(0);
+	unsigned sb_block_size = dm_bm_block_size(bm) / sizeof(__le64);
+
+	/*
+	 * We can't use a validator here - it may be all zeroes.
+	 */
+	r = dm_bm_read_lock(bm, CACHE_SUPERBLOCK_LOCATION, NULL, &b);
+	if (r)
+		return r;
+
+	data_le = dm_block_data(b);
+	*result = 1;
+	for (i = 0; i < sb_block_size; i++) {
+		if (data_le[i] != zero) {
+			*result = 0;
+			break;
+		}
+	}
+
+	return dm_bm_unlock(b);
+}
+
+static void __setup_mapping_info(struct dm_cache_metadata *cmd)
+{
+	struct dm_btree_value_type vt;
+
+	vt.context = NULL;
+	vt.size = sizeof(__le64);
+	vt.inc = NULL;
+	vt.dec = NULL;
+	vt.equal = NULL;
+	dm_array_info_init(&cmd->info, cmd->tm, &vt);
+
+	if (cmd->policy_hint_size) {
+		vt.size = sizeof(__le32);
+		dm_array_info_init(&cmd->hint_info, cmd->tm, &vt);
+	}
+}
+
+static int __write_initial_superblock(struct dm_cache_metadata *cmd)
+{
+	int r;
+	struct dm_block *sblock;
+	size_t metadata_len;
+	struct cache_disk_superblock *disk_super;
+	sector_t bdev_size = i_size_read(cmd->bdev->bd_inode) >> SECTOR_SHIFT;
+
+	/* FIXME: see if we can lose the max sectors limit */
+	if (bdev_size > DM_CACHE_METADATA_MAX_SECTORS)
+		bdev_size = DM_CACHE_METADATA_MAX_SECTORS;
+
+	r = dm_sm_root_size(cmd->metadata_sm, &metadata_len);
+	if (r < 0)
+		return r;
+
+	r = dm_tm_pre_commit(cmd->tm);
+	if (r < 0)
+		return r;
+
+	r = superblock_lock_zero(cmd, &sblock);
+	if (r)
+		return r;
+
+	disk_super = dm_block_data(sblock);
+	disk_super->flags = 0;
+	memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
+	disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC);
+	disk_super->version = cpu_to_le32(CACHE_VERSION);
+	memset(disk_super->policy_name, 0, CACHE_POLICY_NAME_SIZE);
+	disk_super->policy_hint_size = 0;
+
+	r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root,
+			    metadata_len);
+	if (r < 0)
+		goto bad_locked;
+
+	disk_super->mapping_root = cpu_to_le64(cmd->root);
+	disk_super->hint_root = cpu_to_le64(cmd->hint_root);
+	disk_super->discard_root = cpu_to_le64(cmd->discard_root);
+	disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
+	disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
+	disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
+	disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
+	disk_super->cache_blocks = cpu_to_le32(0);
+	memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
+
+	disk_super->read_hits = cpu_to_le32(0);
+	disk_super->read_misses = cpu_to_le32(0);
+	disk_super->write_hits = cpu_to_le32(0);
+	disk_super->write_misses = cpu_to_le32(0);
+
+	return dm_tm_commit(cmd->tm, sblock);
+
+bad_locked:
+	dm_bm_unlock(sblock);
+	return r;
+}
+
+static int __format_metadata(struct dm_cache_metadata *cmd)
+{
+	int r;
+
+	r = dm_tm_create_with_sm(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
+				 &cmd->tm, &cmd->metadata_sm);
+	if (r < 0) {
+		DMERR("tm_create_with_sm failed");
+		return r;
+	}
+
+	__setup_mapping_info(cmd);
+
+	r = dm_array_empty(&cmd->info, &cmd->root);
+	if (r < 0)
+		goto bad;
+
+	dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
+
+	r = dm_bitset_empty(&cmd->discard_info, &cmd->discard_root);
+	if (r < 0)
+		goto bad;
+
+	cmd->discard_block_size = 0;
+	cmd->discard_nr_blocks = 0;
+
+	r = __write_initial_superblock(cmd);
+	if (r)
+		goto bad;
+
+	cmd->clean_when_opened = true;
+	return 0;
+
+bad:
+	dm_tm_destroy(cmd->tm);
+	dm_sm_destroy(cmd->metadata_sm);
+
+	return r;
+}
+
+static int __check_incompat_features(struct cache_disk_superblock *disk_super,
+				     struct dm_cache_metadata *cmd)
+{
+	uint32_t features;
+
+	features = le32_to_cpu(disk_super->incompat_flags) & ~DM_CACHE_FEATURE_INCOMPAT_SUPP;
+	if (features) {
+		DMERR("could not access metadata due to unsupported optional features (%lx).",
+		      (unsigned long)features);
+		return -EINVAL;
+	}
+
+	/*
+	 * Check for read-only metadata to skip the following RDWR checks.
+	 */
+	if (get_disk_ro(cmd->bdev->bd_disk))
+		return 0;
+
+	features = le32_to_cpu(disk_super->compat_ro_flags) & ~DM_CACHE_FEATURE_COMPAT_RO_SUPP;
+	if (features) {
+		DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
+		      (unsigned long)features);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int __open_metadata(struct dm_cache_metadata *cmd)
+{
+	int r;
+	struct dm_block *sblock;
+	struct cache_disk_superblock *disk_super;
+	unsigned long sb_flags;
+
+	r = superblock_read_lock(cmd, &sblock);
+	if (r < 0) {
+		DMERR("couldn't read lock superblock");
+		return r;
+	}
+
+	disk_super = dm_block_data(sblock);
+
+	r = __check_incompat_features(disk_super, cmd);
+	if (r < 0)
+		goto bad;
+
+	r = dm_tm_open_with_sm(cmd->bm, CACHE_SUPERBLOCK_LOCATION,
+			       disk_super->metadata_space_map_root,
+			       sizeof(disk_super->metadata_space_map_root),
+			       &cmd->tm, &cmd->metadata_sm);
+	if (r < 0) {
+		DMERR("tm_open_with_sm failed");
+		goto bad;
+	}
+
+	__setup_mapping_info(cmd);
+	dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
+	sb_flags = le32_to_cpu(disk_super->flags);
+	cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags);
+	return dm_bm_unlock(sblock);
+
+bad:
+	dm_bm_unlock(sblock);
+	return r;
+}
+
+static int __open_or_format_metadata(struct dm_cache_metadata *cmd,
+				     bool format_device)
+{
+	int r, unformatted;
+
+	r = __superblock_all_zeroes(cmd->bm, &unformatted);
+	if (r)
+		return r;
+
+	if (unformatted)
+		return format_device ? __format_metadata(cmd) : -EPERM;
+
+	return __open_metadata(cmd);
+}
+
+static int __create_persistent_data_objects(struct dm_cache_metadata *cmd,
+					    bool may_format_device)
+{
+	int r;
+	cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE,
+					  CACHE_METADATA_CACHE_SIZE,
+					  CACHE_MAX_CONCURRENT_LOCKS);
+	if (IS_ERR(cmd->bm)) {
+		DMERR("could not create block manager");
+		return PTR_ERR(cmd->bm);
+	}
+
+	r = __open_or_format_metadata(cmd, may_format_device);
+	if (r)
+		dm_block_manager_destroy(cmd->bm);
+
+	return r;
+}
+
+static void __destroy_persistent_data_objects(struct dm_cache_metadata *cmd)
+{
+	dm_sm_destroy(cmd->metadata_sm);
+	dm_tm_destroy(cmd->tm);
+	dm_block_manager_destroy(cmd->bm);
+}
+
+typedef unsigned long (*flags_mutator)(unsigned long);
+
+static void update_flags(struct cache_disk_superblock *disk_super,
+			 flags_mutator mutator)
+{
+	uint32_t sb_flags = mutator(le32_to_cpu(disk_super->flags));
+	disk_super->flags = cpu_to_le32(sb_flags);
+}
+
+static unsigned long set_clean_shutdown(unsigned long flags)
+{
+	set_bit(CLEAN_SHUTDOWN, &flags);
+	return flags;
+}
+
+static unsigned long clear_clean_shutdown(unsigned long flags)
+{
+	clear_bit(CLEAN_SHUTDOWN, &flags);
+	return flags;
+}
+
+static void read_superblock_fields(struct dm_cache_metadata *cmd,
+				   struct cache_disk_superblock *disk_super)
+{
+	cmd->root = le64_to_cpu(disk_super->mapping_root);
+	cmd->hint_root = le64_to_cpu(disk_super->hint_root);
+	cmd->discard_root = le64_to_cpu(disk_super->discard_root);
+	cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size);
+	cmd->discard_nr_blocks = to_dblock(le64_to_cpu(disk_super->discard_nr_blocks));
+	cmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
+	cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks));
+	strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name));
+	cmd->policy_hint_size = le32_to_cpu(disk_super->policy_hint_size);
+
+	cmd->stats.read_hits = le32_to_cpu(disk_super->read_hits);
+	cmd->stats.read_misses = le32_to_cpu(disk_super->read_misses);
+	cmd->stats.write_hits = le32_to_cpu(disk_super->write_hits);
+	cmd->stats.write_misses = le32_to_cpu(disk_super->write_misses);
+
+	cmd->changed = false;
+}
+
+/*
+ * The mutator updates the superblock flags.
+ */
+static int __begin_transaction_flags(struct dm_cache_metadata *cmd,
+				     flags_mutator mutator)
+{
+	int r;
+	struct cache_disk_superblock *disk_super;
+	struct dm_block *sblock;
+
+	r = superblock_lock(cmd, &sblock);
+	if (r)
+		return r;
+
+	disk_super = dm_block_data(sblock);
+	update_flags(disk_super, mutator);
+	read_superblock_fields(cmd, disk_super);
+
+	return dm_bm_flush_and_unlock(cmd->bm, sblock);
+}
+
+static int __begin_transaction(struct dm_cache_metadata *cmd)
+{
+	int r;
+	struct cache_disk_superblock *disk_super;
+	struct dm_block *sblock;
+
+	/*
+	 * We re-read the superblock every time.  Shouldn't need to do this
+	 * really.
+	 */
+	r = superblock_read_lock(cmd, &sblock);
+	if (r)
+		return r;
+
+	disk_super = dm_block_data(sblock);
+	read_superblock_fields(cmd, disk_super);
+	dm_bm_unlock(sblock);
+
+	return 0;
+}
+
+static int __commit_transaction(struct dm_cache_metadata *cmd,
+				flags_mutator mutator)
+{
+	int r;
+	size_t metadata_len;
+	struct cache_disk_superblock *disk_super;
+	struct dm_block *sblock;
+
+	/*
+	 * We need to know if the cache_disk_superblock exceeds a 512-byte sector.
+	 */
+	BUILD_BUG_ON(sizeof(struct cache_disk_superblock) > 512);
+
+	r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root,
+			    &cmd->discard_root);
+	if (r)
+		return r;
+
+	r = dm_tm_pre_commit(cmd->tm);
+	if (r < 0)
+		return r;
+
+	r = dm_sm_root_size(cmd->metadata_sm, &metadata_len);
+	if (r < 0)
+		return r;
+
+	r = superblock_lock(cmd, &sblock);
+	if (r)
+		return r;
+
+	disk_super = dm_block_data(sblock);
+
+	if (mutator)
+		update_flags(disk_super, mutator);
+
+	disk_super->mapping_root = cpu_to_le64(cmd->root);
+	disk_super->hint_root = cpu_to_le64(cmd->hint_root);
+	disk_super->discard_root = cpu_to_le64(cmd->discard_root);
+	disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
+	disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
+	disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks));
+	strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name));
+
+	disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits);
+	disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses);
+	disk_super->write_hits = cpu_to_le32(cmd->stats.write_hits);
+	disk_super->write_misses = cpu_to_le32(cmd->stats.write_misses);
+
+	r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root,
+			    metadata_len);
+	if (r < 0) {
+		dm_bm_unlock(sblock);
+		return r;
+	}
+
+	return dm_tm_commit(cmd->tm, sblock);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * The mappings are held in a dm-array that has 64-bit values stored in
+ * little-endian format.  The index is the cblock, the high 48bits of the
+ * value are the oblock and the low 16 bit the flags.
+ */
+#define FLAGS_MASK ((1 << 16) - 1)
+
+static __le64 pack_value(dm_oblock_t block, unsigned flags)
+{
+	uint64_t value = from_oblock(block);
+	value <<= 16;
+	value = value | (flags & FLAGS_MASK);
+	return cpu_to_le64(value);
+}
+
+static void unpack_value(__le64 value_le, dm_oblock_t *block, unsigned *flags)
+{
+	uint64_t value = le64_to_cpu(value_le);
+	uint64_t b = value >> 16;
+	*block = to_oblock(b);
+	*flags = value & FLAGS_MASK;
+}
+
+/*----------------------------------------------------------------*/
+
+struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
+						 sector_t data_block_size,
+						 bool may_format_device,
+						 size_t policy_hint_size)
+{
+	int r;
+	struct dm_cache_metadata *cmd;
+
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+	if (!cmd) {
+		DMERR("could not allocate metadata struct");
+		return NULL;
+	}
+
+	init_rwsem(&cmd->root_lock);
+	cmd->bdev = bdev;
+	cmd->data_block_size = data_block_size;
+	cmd->cache_blocks = 0;
+	cmd->policy_hint_size = policy_hint_size;
+	cmd->changed = true;
+
+	r = __create_persistent_data_objects(cmd, may_format_device);
+	if (r) {
+		kfree(cmd);
+		return ERR_PTR(r);
+	}
+
+	r = __begin_transaction_flags(cmd, clear_clean_shutdown);
+	if (r < 0) {
+		dm_cache_metadata_close(cmd);
+		return ERR_PTR(r);
+	}
+
+	return cmd;
+}
+
+void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
+{
+	__destroy_persistent_data_objects(cmd);
+	kfree(cmd);
+}
+
+int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
+{
+	int r;
+	__le64 null_mapping = pack_value(0, 0);
+
+	down_write(&cmd->root_lock);
+	__dm_bless_for_disk(&null_mapping);
+	r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks),
+			    from_cblock(new_cache_size),
+			    &null_mapping, &cmd->root);
+	if (!r)
+		cmd->cache_blocks = new_cache_size;
+	cmd->changed = true;
+	up_write(&cmd->root_lock);
+
+	return r;
+}
+
+int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
+				   sector_t discard_block_size,
+				   dm_dblock_t new_nr_entries)
+{
+	int r;
+
+	down_write(&cmd->root_lock);
+	r = dm_bitset_resize(&cmd->discard_info,
+			     cmd->discard_root,
+			     from_dblock(cmd->discard_nr_blocks),
+			     from_dblock(new_nr_entries),
+			     false, &cmd->discard_root);
+	if (!r) {
+		cmd->discard_block_size = discard_block_size;
+		cmd->discard_nr_blocks = new_nr_entries;
+	}
+
+	cmd->changed = true;
+	up_write(&cmd->root_lock);
+
+	return r;
+}
+
+static int __set_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
+{
+	return dm_bitset_set_bit(&cmd->discard_info, cmd->discard_root,
+				 from_dblock(b), &cmd->discard_root);
+}
+
+static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
+{
+	return dm_bitset_clear_bit(&cmd->discard_info, cmd->discard_root,
+				   from_dblock(b), &cmd->discard_root);
+}
+
+static int __is_discarded(struct dm_cache_metadata *cmd, dm_dblock_t b,
+			  bool *is_discarded)
+{
+	return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root,
+				  from_dblock(b), &cmd->discard_root,
+				  is_discarded);
+}
+
+static int __discard(struct dm_cache_metadata *cmd,
+		     dm_dblock_t dblock, bool discard)
+{
+	int r;
+
+	r = (discard ? __set_discard : __clear_discard)(cmd, dblock);
+	if (r)
+		return r;
+
+	cmd->changed = true;
+	return 0;
+}
+
+int dm_cache_set_discard(struct dm_cache_metadata *cmd,
+			 dm_dblock_t dblock, bool discard)
+{
+	int r;
+
+	down_write(&cmd->root_lock);
+	r = __discard(cmd, dblock, discard);
+	up_write(&cmd->root_lock);
+
+	return r;
+}
+
+static int __load_discards(struct dm_cache_metadata *cmd,
+			   load_discard_fn fn, void *context)
+{
+	int r = 0;
+	dm_block_t b;
+	bool discard;
+
+	for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
+		dm_dblock_t dblock = to_dblock(b);
+
+		if (cmd->clean_when_opened) {
+			r = __is_discarded(cmd, dblock, &discard);
+			if (r)
+				return r;
+		} else
+			discard = false;
+
+		r = fn(context, cmd->discard_block_size, dblock, discard);
+		if (r)
+			break;
+	}
+
+	return r;
+}
+
+int dm_cache_load_discards(struct dm_cache_metadata *cmd,
+			   load_discard_fn fn, void *context)
+{
+	int r;
+
+	down_read(&cmd->root_lock);
+	r = __load_discards(cmd, fn, context);
+	up_read(&cmd->root_lock);
+
+	return r;
+}
+
+dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd)
+{
+	dm_cblock_t r;
+
+	down_read(&cmd->root_lock);
+	r = cmd->cache_blocks;
+	up_read(&cmd->root_lock);
+
+	return r;
+}
+
+static int __remove(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
+{
+	int r;
+	__le64 value = pack_value(0, 0);
+
+	__dm_bless_for_disk(&value);
+	r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
+			       &value, &cmd->root);
+	if (r)
+		return r;
+
+	cmd->changed = true;
+	return 0;
+}
+
+int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
+{
+	int r;
+
+	down_write(&cmd->root_lock);
+	r = __remove(cmd, cblock);
+	up_write(&cmd->root_lock);
+
+	return r;
+}
+
+static int __insert(struct dm_cache_metadata *cmd,
+		    dm_cblock_t cblock, dm_oblock_t oblock)
+{
+	int r;
+	__le64 value = pack_value(oblock, M_VALID);
+	__dm_bless_for_disk(&value);
+
+	r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
+			       &value, &cmd->root);
+	if (r)
+		return r;
+
+	cmd->changed = true;
+	return 0;
+}
+
+int dm_cache_insert_mapping(struct dm_cache_metadata *cmd,
+			    dm_cblock_t cblock, dm_oblock_t oblock)
+{
+	int r;
+
+	down_write(&cmd->root_lock);
+	r = __insert(cmd, cblock, oblock);
+	up_write(&cmd->root_lock);
+
+	return r;
+}
+
+struct thunk {
+	load_mapping_fn fn;
+	void *context;
+
+	struct dm_cache_metadata *cmd;
+	bool respect_dirty_flags;
+	bool hints_valid;
+};
+
+static bool hints_array_initialized(struct dm_cache_metadata *cmd)
+{
+	return cmd->hint_root && cmd->policy_hint_size;
+}
+
+static bool hints_array_available(struct dm_cache_metadata *cmd,
+				  const char *policy_name)
+{
+	bool policy_names_match = !strncmp(cmd->policy_name, policy_name,
+					   sizeof(cmd->policy_name));
+
+	return cmd->clean_when_opened && policy_names_match &&
+		hints_array_initialized(cmd);
+}
+
+static int __load_mapping(void *context, uint64_t cblock, void *leaf)
+{
+	int r = 0;
+	bool dirty;
+	__le64 value;
+	__le32 hint_value = 0;
+	dm_oblock_t oblock;
+	unsigned flags;
+	struct thunk *thunk = context;
+	struct dm_cache_metadata *cmd = thunk->cmd;
+
+	memcpy(&value, leaf, sizeof(value));
+	unpack_value(value, &oblock, &flags);
+
+	if (flags & M_VALID) {
+		if (thunk->hints_valid) {
+			r = dm_array_get_value(&cmd->hint_info, cmd->hint_root,
+					       cblock, &hint_value);
+			if (r && r != -ENODATA)
+				return r;
+		}
+
+		dirty = thunk->respect_dirty_flags ? (flags & M_DIRTY) : true;
+		r = thunk->fn(thunk->context, oblock, to_cblock(cblock),
+			      dirty, le32_to_cpu(hint_value), thunk->hints_valid);
+	}
+
+	return r;
+}
+
+static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_name,
+			   load_mapping_fn fn, void *context)
+{
+	struct thunk thunk;
+
+	thunk.fn = fn;
+	thunk.context = context;
+
+	thunk.cmd = cmd;
+	thunk.respect_dirty_flags = cmd->clean_when_opened;
+	thunk.hints_valid = hints_array_available(cmd, policy_name);
+
+	return dm_array_walk(&cmd->info, cmd->root, __load_mapping, &thunk);
+}
+
+int dm_cache_load_mappings(struct dm_cache_metadata *cmd, const char *policy_name,
+			   load_mapping_fn fn, void *context)
+{
+	int r;
| 921 | |||
| 922 | down_read(&cmd->root_lock); | ||
| 923 | r = __load_mappings(cmd, policy_name, fn, context); | ||
| 924 | up_read(&cmd->root_lock); | ||
| 925 | |||
| 926 | return r; | ||
| 927 | } | ||
| 928 | |||
| 929 | static int __dump_mapping(void *context, uint64_t cblock, void *leaf) | ||
| 930 | { | ||
| 931 | int r = 0; | ||
| 932 | __le64 value; | ||
| 933 | dm_oblock_t oblock; | ||
| 934 | unsigned flags; | ||
| 935 | |||
| 936 | memcpy(&value, leaf, sizeof(value)); | ||
| 937 | unpack_value(value, &oblock, &flags); | ||
| 938 | |||
| 939 | return r; | ||
| 940 | } | ||
| 941 | |||
| 942 | static int __dump_mappings(struct dm_cache_metadata *cmd) | ||
| 943 | { | ||
| 944 | return dm_array_walk(&cmd->info, cmd->root, __dump_mapping, NULL); | ||
| 945 | } | ||
| 946 | |||
| 947 | void dm_cache_dump(struct dm_cache_metadata *cmd) | ||
| 948 | { | ||
| 949 | down_read(&cmd->root_lock); | ||
| 950 | __dump_mappings(cmd); | ||
| 951 | up_read(&cmd->root_lock); | ||
| 952 | } | ||
| 953 | |||
| 954 | int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd) | ||
| 955 | { | ||
| 956 | int r; | ||
| 957 | |||
| 958 | down_read(&cmd->root_lock); | ||
| 959 | r = cmd->changed; | ||
| 960 | up_read(&cmd->root_lock); | ||
| 961 | |||
| 962 | return r; | ||
| 963 | } | ||
| 964 | |||
| 965 | static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty) | ||
| 966 | { | ||
| 967 | int r; | ||
| 968 | unsigned flags; | ||
| 969 | dm_oblock_t oblock; | ||
| 970 | __le64 value; | ||
| 971 | |||
| 972 | r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(cblock), &value); | ||
| 973 | if (r) | ||
| 974 | return r; | ||
| 975 | |||
| 976 | unpack_value(value, &oblock, &flags); | ||
| 977 | |||
| 978 | if (((flags & M_DIRTY) && dirty) || (!(flags & M_DIRTY) && !dirty)) | ||
| 979 | /* nothing to be done */ | ||
| 980 | return 0; | ||
| 981 | |||
| 982 | value = pack_value(oblock, flags | (dirty ? M_DIRTY : 0)); | ||
| 983 | __dm_bless_for_disk(&value); | ||
| 984 | |||
| 985 | r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock), | ||
| 986 | &value, &cmd->root); | ||
| 987 | if (r) | ||
| 988 | return r; | ||
| 989 | |||
| 990 | cmd->changed = true; | ||
| 991 | return 0; | ||
| 992 | |||
| 993 | } | ||
| 994 | |||
| 995 | int dm_cache_set_dirty(struct dm_cache_metadata *cmd, | ||
| 996 | dm_cblock_t cblock, bool dirty) | ||
| 997 | { | ||
| 998 | int r; | ||
| 999 | |||
| 1000 | down_write(&cmd->root_lock); | ||
| 1001 | r = __dirty(cmd, cblock, dirty); | ||
| 1002 | up_write(&cmd->root_lock); | ||
| 1003 | |||
| 1004 | return r; | ||
| 1005 | } | ||
| 1006 | |||
| 1007 | void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd, | ||
| 1008 | struct dm_cache_statistics *stats) | ||
| 1009 | { | ||
| 1010 | down_read(&cmd->root_lock); | ||
| 1011 | memcpy(stats, &cmd->stats, sizeof(*stats)); | ||
| 1012 | up_read(&cmd->root_lock); | ||
| 1013 | } | ||
| 1014 | |||
| 1015 | void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd, | ||
| 1016 | struct dm_cache_statistics *stats) | ||
| 1017 | { | ||
| 1018 | down_write(&cmd->root_lock); | ||
| 1019 | memcpy(&cmd->stats, stats, sizeof(*stats)); | ||
| 1020 | up_write(&cmd->root_lock); | ||
| 1021 | } | ||
| 1022 | |||
| 1023 | int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown) | ||
| 1024 | { | ||
| 1025 | int r; | ||
| 1026 | flags_mutator mutator = (clean_shutdown ? set_clean_shutdown : | ||
| 1027 | clear_clean_shutdown); | ||
| 1028 | |||
| 1029 | down_write(&cmd->root_lock); | ||
| 1030 | r = __commit_transaction(cmd, mutator); | ||
| 1031 | if (r) | ||
| 1032 | goto out; | ||
| 1033 | |||
| 1034 | r = __begin_transaction(cmd); | ||
| 1035 | |||
| 1036 | out: | ||
| 1037 | up_write(&cmd->root_lock); | ||
| 1038 | return r; | ||
| 1039 | } | ||
| 1040 | |||
| 1041 | int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd, | ||
| 1042 | dm_block_t *result) | ||
| 1043 | { | ||
| 1044 | int r = -EINVAL; | ||
| 1045 | |||
| 1046 | down_read(&cmd->root_lock); | ||
| 1047 | r = dm_sm_get_nr_free(cmd->metadata_sm, result); | ||
| 1048 | up_read(&cmd->root_lock); | ||
| 1049 | |||
| 1050 | return r; | ||
| 1051 | } | ||
| 1052 | |||
| 1053 | int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd, | ||
| 1054 | dm_block_t *result) | ||
| 1055 | { | ||
| 1056 | int r = -EINVAL; | ||
| 1057 | |||
| 1058 | down_read(&cmd->root_lock); | ||
| 1059 | r = dm_sm_get_nr_blocks(cmd->metadata_sm, result); | ||
| 1060 | up_read(&cmd->root_lock); | ||
| 1061 | |||
| 1062 | return r; | ||
| 1063 | } | ||
| 1064 | |||
| 1065 | /*----------------------------------------------------------------*/ | ||
| 1066 | |||
| 1067 | static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy) | ||
| 1068 | { | ||
| 1069 | int r; | ||
| 1070 | __le32 value; | ||
| 1071 | size_t hint_size; | ||
| 1072 | const char *policy_name = dm_cache_policy_get_name(policy); | ||
| 1073 | |||
| 1074 | if (!policy_name[0] || | ||
| 1075 | (strlen(policy_name) > sizeof(cmd->policy_name) - 1)) | ||
| 1076 | return -EINVAL; | ||
| 1077 | |||
| 1078 | if (strcmp(cmd->policy_name, policy_name)) { | ||
| 1079 | strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name)); | ||
| 1080 | |||
| 1081 | hint_size = dm_cache_policy_get_hint_size(policy); | ||
| 1082 | if (!hint_size) | ||
| 1083 | return 0; /* short-circuit hints initialization */ | ||
| 1084 | cmd->policy_hint_size = hint_size; | ||
| 1085 | |||
| 1086 | if (cmd->hint_root) { | ||
| 1087 | r = dm_array_del(&cmd->hint_info, cmd->hint_root); | ||
| 1088 | if (r) | ||
| 1089 | return r; | ||
| 1090 | } | ||
| 1091 | |||
| 1092 | r = dm_array_empty(&cmd->hint_info, &cmd->hint_root); | ||
| 1093 | if (r) | ||
| 1094 | return r; | ||
| 1095 | |||
| 1096 | value = cpu_to_le32(0); | ||
| 1097 | __dm_bless_for_disk(&value); | ||
| 1098 | r = dm_array_resize(&cmd->hint_info, cmd->hint_root, 0, | ||
| 1099 | from_cblock(cmd->cache_blocks), | ||
| 1100 | &value, &cmd->hint_root); | ||
| 1101 | if (r) | ||
| 1102 | return r; | ||
| 1103 | } | ||
| 1104 | |||
| 1105 | return 0; | ||
| 1106 | } | ||
| 1107 | |||
| 1108 | int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy) | ||
| 1109 | { | ||
| 1110 | int r; | ||
| 1111 | |||
| 1112 | down_write(&cmd->root_lock); | ||
| 1113 | r = begin_hints(cmd, policy); | ||
| 1114 | up_write(&cmd->root_lock); | ||
| 1115 | |||
| 1116 | return r; | ||
| 1117 | } | ||
| 1118 | |||
| 1119 | static int save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock, | ||
| 1120 | uint32_t hint) | ||
| 1121 | { | ||
| 1122 | int r; | ||
| 1123 | __le32 value = cpu_to_le32(hint); | ||
| 1124 | __dm_bless_for_disk(&value); | ||
| 1125 | |||
| 1126 | r = dm_array_set_value(&cmd->hint_info, cmd->hint_root, | ||
| 1127 | from_cblock(cblock), &value, &cmd->hint_root); | ||
| 1128 | cmd->changed = true; | ||
| 1129 | |||
| 1130 | return r; | ||
| 1131 | } | ||
| 1132 | |||
| 1133 | int dm_cache_save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock, | ||
| 1134 | uint32_t hint) | ||
| 1135 | { | ||
| 1136 | int r; | ||
| 1137 | |||
| 1138 | if (!hints_array_initialized(cmd)) | ||
| 1139 | return 0; | ||
| 1140 | |||
| 1141 | down_write(&cmd->root_lock); | ||
| 1142 | r = save_hint(cmd, cblock, hint); | ||
| 1143 | up_write(&cmd->root_lock); | ||
| 1144 | |||
| 1145 | return r; | ||
| 1146 | } | ||
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h new file mode 100644 index 000000000000..135864ea0eee --- /dev/null +++ b/drivers/md/dm-cache-metadata.h | |||
| @@ -0,0 +1,142 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2012 Red Hat, Inc. | ||
| 3 | * | ||
| 4 | * This file is released under the GPL. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #ifndef DM_CACHE_METADATA_H | ||
| 8 | #define DM_CACHE_METADATA_H | ||
| 9 | |||
| 10 | #include "dm-cache-block-types.h" | ||
| 11 | #include "dm-cache-policy-internal.h" | ||
| 12 | |||
| 13 | /*----------------------------------------------------------------*/ | ||
| 14 | |||
| 15 | #define DM_CACHE_METADATA_BLOCK_SIZE 4096 | ||
| 16 | |||
| 17 | /* FIXME: remove this restriction */ | ||
| 18 | /* | ||
| 19 | * The metadata device is currently limited in size. | ||
| 20 | * | ||
| 21 | * We have one block of index, which can hold 255 index entries. Each | ||
| 22 | * index entry contains allocation info about 16k metadata blocks. | ||
| 23 | */ | ||
| 24 | #define DM_CACHE_METADATA_MAX_SECTORS (255 * (1 << 14) * (DM_CACHE_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) | ||
| 25 | |||
| 26 | /* | ||
| 27 | * A metadata device larger than 16GB triggers a warning. | ||
| 28 | */ | ||
| 29 | #define DM_CACHE_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT)) | ||
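
As a quick cross-check of these two limits (a worked calculation, not part of the patch; SECTOR_SHIFT is assumed to be the usual kernel value of 9, i.e. 512-byte sectors):

    /*
     * sectors per metadata block    = 4096 / 512   = 8
     * max metadata blocks           = 255 * 16384  = 4177920
     * DM_CACHE_METADATA_MAX_SECTORS = 4177920 * 8  = 33423360
     *
     * 33423360 sectors * 512 bytes is roughly 15.9GB, just under the
     * 16GB (33554432-sector) warning threshold above.
     */
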
| 30 | |||
| 31 | /*----------------------------------------------------------------*/ | ||
| 32 | |||
| 33 | /* | ||
| 34 | * Ext[234]-style compat feature flags. | ||
| 35 | * | ||
| 36 | * A new feature which old metadata will still be compatible with should | ||
| 37 | * define a DM_CACHE_FEATURE_COMPAT_* flag (rarely useful). | ||
| 38 | * | ||
| 39 | * A new feature that is not compatible with old code should define a | ||
| 40 | * DM_CACHE_FEATURE_INCOMPAT_* flag and guard the relevant code with | ||
| 41 | * that flag. | ||
| 42 | * | ||
| 43 | * A new feature that is not compatible with old code accessing the | ||
| 44 | * metadata RDWR should define a DM_CACHE_FEATURE_RO_COMPAT_* flag and | ||
| 45 | * guard the relevant code with that flag. | ||
| 46 | * | ||
| 47 | * As these various flags are defined they should be added to the | ||
| 48 | * following masks. | ||
| 49 | */ | ||
| 50 | #define DM_CACHE_FEATURE_COMPAT_SUPP 0UL | ||
| 51 | #define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL | ||
| 52 | #define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL | ||
| 53 | |||
| 54 | /* | ||
| 55 | * Reopens or creates a new, empty metadata volume. | ||
| 56 | * Returns an ERR_PTR on failure. | ||
| 57 | */ | ||
| 58 | struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, | ||
| 59 | sector_t data_block_size, | ||
| 60 | bool may_format_device, | ||
| 61 | size_t policy_hint_size); | ||
| 62 | |||
| 63 | void dm_cache_metadata_close(struct dm_cache_metadata *cmd); | ||
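
For orientation, the open/commit/close lifecycle might be driven like this (a minimal sketch, not from the patch; the 64-sector data block size, the absence of policy hints, and the error handling are illustrative assumptions):

    #include <linux/err.h>
    #include "dm-cache-metadata.h"

    /* Hypothetical helper: format-or-open the metadata, commit once, close. */
    static int example_metadata_roundtrip(struct block_device *bdev)
    {
            int r;
            struct dm_cache_metadata *cmd;

            cmd = dm_cache_metadata_open(bdev, 64, true, 0);
            if (IS_ERR(cmd))
                    return PTR_ERR(cmd);    /* open returns an ERR_PTR on failure */

            /* ... insert mappings, set dirty/discard bits, etc ... */

            r = dm_cache_commit(cmd, true); /* true == clean shutdown */
            dm_cache_metadata_close(cmd);

            return r;
    }
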
| 64 | |||
| 65 | /* | ||
| 66 | * The metadata needs to know how many cache blocks there are. We don't | ||
| 67 | * care about the origin, assuming the core target is giving us valid | ||
| 68 | * origin blocks to map to. | ||
| 69 | */ | ||
| 70 | int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size); | ||
| 71 | dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd); | ||
| 72 | |||
| 73 | int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd, | ||
| 74 | sector_t discard_block_size, | ||
| 75 | dm_dblock_t new_nr_entries); | ||
| 76 | |||
| 77 | typedef int (*load_discard_fn)(void *context, sector_t discard_block_size, | ||
| 78 | dm_dblock_t dblock, bool discarded); | ||
| 79 | int dm_cache_load_discards(struct dm_cache_metadata *cmd, | ||
| 80 | load_discard_fn fn, void *context); | ||
| 81 | |||
| 82 | int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_dblock_t dblock, bool discard); | ||
| 83 | |||
| 84 | int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock); | ||
| 85 | int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock, dm_oblock_t oblock); | ||
| 86 | int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd); | ||
| 87 | |||
| 88 | typedef int (*load_mapping_fn)(void *context, dm_oblock_t oblock, | ||
| 89 | dm_cblock_t cblock, bool dirty, | ||
| 90 | uint32_t hint, bool hint_valid); | ||
| 91 | int dm_cache_load_mappings(struct dm_cache_metadata *cmd, | ||
| 92 | const char *policy_name, | ||
| 93 | load_mapping_fn fn, | ||
| 94 | void *context); | ||
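
A load_mapping_fn implementation could be as simple as the following (a hypothetical callback; the dirty-counting context is invented for illustration):

    struct example_load_ctx {
            unsigned nr_dirty;      /* hypothetical: count dirty mappings */
    };

    static int example_load_mapping(void *context, dm_oblock_t oblock,
                                    dm_cblock_t cblock, bool dirty,
                                    uint32_t hint, bool hint_valid)
    {
            struct example_load_ctx *ctx = context;

            if (dirty)
                    ctx->nr_dirty++;

            /*
             * A real caller would hand the mapping to its policy here.
             * Returning non-zero aborts the walk in dm_cache_load_mappings().
             */
            return 0;
    }
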
| 95 | |||
| 96 | int dm_cache_set_dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty); | ||
| 97 | |||
| 98 | struct dm_cache_statistics { | ||
| 99 | uint32_t read_hits; | ||
| 100 | uint32_t read_misses; | ||
| 101 | uint32_t write_hits; | ||
| 102 | uint32_t write_misses; | ||
| 103 | }; | ||
| 104 | |||
| 105 | void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd, | ||
| 106 | struct dm_cache_statistics *stats); | ||
| 107 | void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd, | ||
| 108 | struct dm_cache_statistics *stats); | ||
| 109 | |||
| 110 | int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown); | ||
| 111 | |||
| 112 | int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd, | ||
| 113 | dm_block_t *result); | ||
| 114 | |||
| 115 | int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd, | ||
| 116 | dm_block_t *result); | ||
| 117 | |||
| 118 | void dm_cache_dump(struct dm_cache_metadata *cmd); | ||
| 119 | |||
| 120 | /* | ||
| 121 | * The policy is invited to save a 32-bit hint value for every cblock (e.g. | ||
| 122 | * for a hit count). These are stored against the policy name. If | ||
| 123 | * policies are changed, then hints will be lost. If the machine crashes, | ||
| 124 | * hints will be lost. | ||
| 125 | * | ||
| 126 | * The hints are indexed by the cblock, but many policies will not | ||
| 127 | * necessarily have a fast way of accessing them efficiently by cblock. So | ||
| 128 | * rather than querying the policy for each cblock, we let it walk its data | ||
| 129 | * structures and fill in the hints in whatever order it wishes. | ||
| 130 | */ | ||
| 131 | |||
| 132 | int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *p); | ||
| 133 | |||
| 134 | /* | ||
| 135 | * Requests hints for every cblock and stores them in the metadata device. | ||
| 136 | */ | ||
| 137 | int dm_cache_save_hint(struct dm_cache_metadata *cmd, | ||
| 138 | dm_cblock_t cblock, uint32_t hint); | ||
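
Taken together, saving hints is a two-step affair; roughly (a sketch: walk_policy_hints() is a stand-in for whatever per-cblock walk the policy provides, not a real function):

    static int example_save_one_hint(void *context, dm_cblock_t cblock,
                                     uint32_t hint)
    {
            struct dm_cache_metadata *cmd = context;

            return dm_cache_save_hint(cmd, cblock, hint);
    }

    static int example_save_all_hints(struct dm_cache_metadata *cmd,
                                      struct dm_cache_policy *policy)
    {
            /* Reset the hint array for this policy's name and hint size ... */
            int r = dm_cache_begin_hints(cmd, policy);
            if (r)
                    return r;

            /* ... then let the policy visit every cblock in its own order. */
            return walk_policy_hints(policy, example_save_one_hint, cmd);
    }
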
| 139 | |||
| 140 | /*----------------------------------------------------------------*/ | ||
| 141 | |||
| 142 | #endif /* DM_CACHE_METADATA_H */ | ||
diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c new file mode 100644 index 000000000000..cc05d70b3cb8 --- /dev/null +++ b/drivers/md/dm-cache-policy-cleaner.c | |||
| @@ -0,0 +1,464 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2012 Red Hat. All rights reserved. | ||
| 3 | * | ||
| 4 | * writeback cache policy supporting flushing out dirty cache blocks. | ||
| 5 | * | ||
| 6 | * This file is released under the GPL. | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include "dm-cache-policy.h" | ||
| 10 | #include "dm.h" | ||
| 11 | |||
| 12 | #include <linux/hash.h> | ||
| 13 | #include <linux/module.h> | ||
| 14 | #include <linux/slab.h> | ||
| 15 | #include <linux/vmalloc.h> | ||
| 16 | |||
| 17 | /*----------------------------------------------------------------*/ | ||
| 18 | |||
| 19 | #define DM_MSG_PREFIX "cache cleaner" | ||
| 20 | #define CLEANER_VERSION "1.0.0" | ||
| 21 | |||
| 22 | /* Cache entry struct. */ | ||
| 23 | struct wb_cache_entry { | ||
| 24 | struct list_head list; | ||
| 25 | struct hlist_node hlist; | ||
| 26 | |||
| 27 | dm_oblock_t oblock; | ||
| 28 | dm_cblock_t cblock; | ||
| 29 | bool dirty:1; | ||
| 30 | bool pending:1; | ||
| 31 | }; | ||
| 32 | |||
| 33 | struct hash { | ||
| 34 | struct hlist_head *table; | ||
| 35 | dm_block_t hash_bits; | ||
| 36 | unsigned nr_buckets; | ||
| 37 | }; | ||
| 38 | |||
| 39 | struct policy { | ||
| 40 | struct dm_cache_policy policy; | ||
| 41 | spinlock_t lock; | ||
| 42 | |||
| 43 | struct list_head free; | ||
| 44 | struct list_head clean; | ||
| 45 | struct list_head clean_pending; | ||
| 46 | struct list_head dirty; | ||
| 47 | |||
| 48 | /* | ||
| 49 | * We know exactly how many cblocks will be needed, | ||
| 50 | * so we can allocate them up front. | ||
| 51 | */ | ||
| 52 | dm_cblock_t cache_size, nr_cblocks_allocated; | ||
| 53 | struct wb_cache_entry *cblocks; | ||
| 54 | struct hash chash; | ||
| 55 | }; | ||
| 56 | |||
| 57 | /*----------------------------------------------------------------------------*/ | ||
| 58 | |||
| 59 | /* | ||
| 60 | * Low-level functions. | ||
| 61 | */ | ||
| 62 | static unsigned next_power(unsigned n, unsigned min) | ||
| 63 | { | ||
| 64 | return roundup_pow_of_two(max(n, min)); | ||
| 65 | } | ||
| 66 | |||
| 67 | static struct policy *to_policy(struct dm_cache_policy *p) | ||
| 68 | { | ||
| 69 | return container_of(p, struct policy, policy); | ||
| 70 | } | ||
| 71 | |||
| 72 | static struct list_head *list_pop(struct list_head *q) | ||
| 73 | { | ||
| 74 | struct list_head *r = q->next; | ||
| 75 | |||
| 76 | list_del(r); | ||
| 77 | |||
| 78 | return r; | ||
| 79 | } | ||
| 80 | |||
| 81 | /*----------------------------------------------------------------------------*/ | ||
| 82 | |||
| 83 | /* Allocate/free various resources. */ | ||
| 84 | static int alloc_hash(struct hash *hash, unsigned elts) | ||
| 85 | { | ||
| 86 | hash->nr_buckets = next_power(elts >> 4, 16); | ||
| 87 | hash->hash_bits = ffs(hash->nr_buckets) - 1; | ||
| 88 | hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets); | ||
| 89 | |||
| 90 | return hash->table ? 0 : -ENOMEM; | ||
| 91 | } | ||
| 92 | |||
| 93 | static void free_hash(struct hash *hash) | ||
| 94 | { | ||
| 95 | vfree(hash->table); | ||
| 96 | } | ||
| 97 | |||
| 98 | static int alloc_cache_blocks_with_hash(struct policy *p, dm_cblock_t cache_size) | ||
| 99 | { | ||
| 100 | int r = -ENOMEM; | ||
| 101 | |||
| 102 | p->cblocks = vzalloc(sizeof(*p->cblocks) * from_cblock(cache_size)); | ||
| 103 | if (p->cblocks) { | ||
| 104 | unsigned u = from_cblock(cache_size); | ||
| 105 | |||
| 106 | while (u--) | ||
| 107 | list_add(&p->cblocks[u].list, &p->free); | ||
| 108 | |||
| 109 | p->nr_cblocks_allocated = 0; | ||
| 110 | |||
| 111 | /* Cache entries hash. */ | ||
| 112 | r = alloc_hash(&p->chash, from_cblock(cache_size)); | ||
| 113 | if (r) | ||
| 114 | vfree(p->cblocks); | ||
| 115 | } | ||
| 116 | |||
| 117 | return r; | ||
| 118 | } | ||
| 119 | |||
| 120 | static void free_cache_blocks_and_hash(struct policy *p) | ||
| 121 | { | ||
| 122 | free_hash(&p->chash); | ||
| 123 | vfree(p->cblocks); | ||
| 124 | } | ||
| 125 | |||
| 126 | static struct wb_cache_entry *alloc_cache_entry(struct policy *p) | ||
| 127 | { | ||
| 128 | struct wb_cache_entry *e; | ||
| 129 | |||
| 130 | BUG_ON(from_cblock(p->nr_cblocks_allocated) >= from_cblock(p->cache_size)); | ||
| 131 | |||
| 132 | e = list_entry(list_pop(&p->free), struct wb_cache_entry, list); | ||
| 133 | p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) + 1); | ||
| 134 | |||
| 135 | return e; | ||
| 136 | } | ||
| 137 | |||
| 138 | /*----------------------------------------------------------------------------*/ | ||
| 139 | |||
| 140 | /* Hash functions (lookup, insert, remove). */ | ||
| 141 | static struct wb_cache_entry *lookup_cache_entry(struct policy *p, dm_oblock_t oblock) | ||
| 142 | { | ||
| 143 | struct hash *hash = &p->chash; | ||
| 144 | unsigned h = hash_64(from_oblock(oblock), hash->hash_bits); | ||
| 145 | struct wb_cache_entry *cur; | ||
| 146 | struct hlist_head *bucket = &hash->table[h]; | ||
| 147 | |||
| 148 | hlist_for_each_entry(cur, bucket, hlist) { | ||
| 149 | if (cur->oblock == oblock) { | ||
| 150 | /* Move to the front of the bucket for faster access. */ | ||
| 151 | hlist_del(&cur->hlist); | ||
| 152 | hlist_add_head(&cur->hlist, bucket); | ||
| 153 | return cur; | ||
| 154 | } | ||
| 155 | } | ||
| 156 | |||
| 157 | return NULL; | ||
| 158 | } | ||
| 159 | |||
| 160 | static void insert_cache_hash_entry(struct policy *p, struct wb_cache_entry *e) | ||
| 161 | { | ||
| 162 | unsigned h = hash_64(from_oblock(e->oblock), p->chash.hash_bits); | ||
| 163 | |||
| 164 | hlist_add_head(&e->hlist, &p->chash.table[h]); | ||
| 165 | } | ||
| 166 | |||
| 167 | static void remove_cache_hash_entry(struct wb_cache_entry *e) | ||
| 168 | { | ||
| 169 | hlist_del(&e->hlist); | ||
| 170 | } | ||
| 171 | |||
| 172 | /* Public interface (see dm-cache-policy.h). */ | ||
| 173 | static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock, | ||
| 174 | bool can_block, bool can_migrate, bool discarded_oblock, | ||
| 175 | struct bio *bio, struct policy_result *result) | ||
| 176 | { | ||
| 177 | struct policy *p = to_policy(pe); | ||
| 178 | struct wb_cache_entry *e; | ||
| 179 | unsigned long flags; | ||
| 180 | |||
| 181 | result->op = POLICY_MISS; | ||
| 182 | |||
| 183 | if (can_block) | ||
| 184 | spin_lock_irqsave(&p->lock, flags); | ||
| 185 | |||
| 186 | else if (!spin_trylock_irqsave(&p->lock, flags)) | ||
| 187 | return -EWOULDBLOCK; | ||
| 188 | |||
| 189 | e = lookup_cache_entry(p, oblock); | ||
| 190 | if (e) { | ||
| 191 | result->op = POLICY_HIT; | ||
| 192 | result->cblock = e->cblock; | ||
| 193 | |||
| 194 | } | ||
| 195 | |||
| 196 | spin_unlock_irqrestore(&p->lock, flags); | ||
| 197 | |||
| 198 | return 0; | ||
| 199 | } | ||
| 200 | |||
| 201 | static int wb_lookup(struct dm_cache_policy *pe, dm_oblock_t oblock, dm_cblock_t *cblock) | ||
| 202 | { | ||
| 203 | int r; | ||
| 204 | struct policy *p = to_policy(pe); | ||
| 205 | struct wb_cache_entry *e; | ||
| 206 | unsigned long flags; | ||
| 207 | |||
| 208 | if (!spin_trylock_irqsave(&p->lock, flags)) | ||
| 209 | return -EWOULDBLOCK; | ||
| 210 | |||
| 211 | e = lookup_cache_entry(p, oblock); | ||
| 212 | if (e) { | ||
| 213 | *cblock = e->cblock; | ||
| 214 | r = 0; | ||
| 215 | |||
| 216 | } else | ||
| 217 | r = -ENOENT; | ||
| 218 | |||
| 219 | spin_unlock_irqrestore(&p->lock, flags); | ||
| 220 | |||
| 221 | return r; | ||
| 222 | } | ||
| 223 | |||
| 224 | static void __set_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock, bool set) | ||
| 225 | { | ||
| 226 | struct policy *p = to_policy(pe); | ||
| 227 | struct wb_cache_entry *e; | ||
| 228 | |||
| 229 | e = lookup_cache_entry(p, oblock); | ||
| 230 | BUG_ON(!e); | ||
| 231 | |||
| 232 | if (set) { | ||
| 233 | if (!e->dirty) { | ||
| 234 | e->dirty = true; | ||
| 235 | list_move(&e->list, &p->dirty); | ||
| 236 | } | ||
| 237 | |||
| 238 | } else { | ||
| 239 | if (e->dirty) { | ||
| 240 | e->pending = false; | ||
| 241 | e->dirty = false; | ||
| 242 | list_move(&e->list, &p->clean); | ||
| 243 | } | ||
| 244 | } | ||
| 245 | } | ||
| 246 | |||
| 247 | static void wb_set_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock) | ||
| 248 | { | ||
| 249 | struct policy *p = to_policy(pe); | ||
| 250 | unsigned long flags; | ||
| 251 | |||
| 252 | spin_lock_irqsave(&p->lock, flags); | ||
| 253 | __set_clear_dirty(pe, oblock, true); | ||
| 254 | spin_unlock_irqrestore(&p->lock, flags); | ||
| 255 | } | ||
| 256 | |||
| 257 | static void wb_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock) | ||
| 258 | { | ||
| 259 | struct policy *p = to_policy(pe); | ||
| 260 | unsigned long flags; | ||
| 261 | |||
| 262 | spin_lock_irqsave(&p->lock, flags); | ||
| 263 | __set_clear_dirty(pe, oblock, false); | ||
| 264 | spin_unlock_irqrestore(&p->lock, flags); | ||
| 265 | } | ||
| 266 | |||
| 267 | static void add_cache_entry(struct policy *p, struct wb_cache_entry *e) | ||
| 268 | { | ||
| 269 | insert_cache_hash_entry(p, e); | ||
| 270 | if (e->dirty) | ||
| 271 | list_add(&e->list, &p->dirty); | ||
| 272 | else | ||
| 273 | list_add(&e->list, &p->clean); | ||
| 274 | } | ||
| 275 | |||
| 276 | static int wb_load_mapping(struct dm_cache_policy *pe, | ||
| 277 | dm_oblock_t oblock, dm_cblock_t cblock, | ||
| 278 | uint32_t hint, bool hint_valid) | ||
| 279 | { | ||
| 280 | int r; | ||
| 281 | struct policy *p = to_policy(pe); | ||
| 282 | struct wb_cache_entry *e = alloc_cache_entry(p); | ||
| 283 | |||
| 284 | if (e) { | ||
| 285 | e->cblock = cblock; | ||
| 286 | e->oblock = oblock; | ||
| 287 | e->dirty = false; /* blocks default to clean */ | ||
| 288 | add_cache_entry(p, e); | ||
| 289 | r = 0; | ||
| 290 | |||
| 291 | } else | ||
| 292 | r = -ENOMEM; | ||
| 293 | |||
| 294 | return r; | ||
| 295 | } | ||
| 296 | |||
| 297 | static void wb_destroy(struct dm_cache_policy *pe) | ||
| 298 | { | ||
| 299 | struct policy *p = to_policy(pe); | ||
| 300 | |||
| 301 | free_cache_blocks_and_hash(p); | ||
| 302 | kfree(p); | ||
| 303 | } | ||
| 304 | |||
| 305 | static struct wb_cache_entry *__wb_force_remove_mapping(struct policy *p, dm_oblock_t oblock) | ||
| 306 | { | ||
| 307 | struct wb_cache_entry *r = lookup_cache_entry(p, oblock); | ||
| 308 | |||
| 309 | BUG_ON(!r); | ||
| 310 | |||
| 311 | remove_cache_hash_entry(r); | ||
| 312 | list_del(&r->list); | ||
| 313 | |||
| 314 | return r; | ||
| 315 | } | ||
| 316 | |||
| 317 | static void wb_remove_mapping(struct dm_cache_policy *pe, dm_oblock_t oblock) | ||
| 318 | { | ||
| 319 | struct policy *p = to_policy(pe); | ||
| 320 | struct wb_cache_entry *e; | ||
| 321 | unsigned long flags; | ||
| 322 | |||
| 323 | spin_lock_irqsave(&p->lock, flags); | ||
| 324 | e = __wb_force_remove_mapping(p, oblock); | ||
| 325 | list_add_tail(&e->list, &p->free); | ||
| 326 | BUG_ON(!from_cblock(p->nr_cblocks_allocated)); | ||
| 327 | p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) - 1); | ||
| 328 | spin_unlock_irqrestore(&p->lock, flags); | ||
| 329 | } | ||
| 330 | |||
| 331 | static void wb_force_mapping(struct dm_cache_policy *pe, | ||
| 332 | dm_oblock_t current_oblock, dm_oblock_t oblock) | ||
| 333 | { | ||
| 334 | struct policy *p = to_policy(pe); | ||
| 335 | struct wb_cache_entry *e; | ||
| 336 | unsigned long flags; | ||
| 337 | |||
| 338 | spin_lock_irqsave(&p->lock, flags); | ||
| 339 | e = __wb_force_remove_mapping(p, current_oblock); | ||
| 340 | e->oblock = oblock; | ||
| 341 | add_cache_entry(p, e); | ||
| 342 | spin_unlock_irqrestore(&p->lock, flags); | ||
| 343 | } | ||
| 344 | |||
| 345 | static struct wb_cache_entry *get_next_dirty_entry(struct policy *p) | ||
| 346 | { | ||
| 347 | struct list_head *l; | ||
| 348 | struct wb_cache_entry *r; | ||
| 349 | |||
| 350 | if (list_empty(&p->dirty)) | ||
| 351 | return NULL; | ||
| 352 | |||
| 353 | l = list_pop(&p->dirty); | ||
| 354 | r = container_of(l, struct wb_cache_entry, list); | ||
| 355 | list_add(l, &p->clean_pending); | ||
| 356 | |||
| 357 | return r; | ||
| 358 | } | ||
| 359 | |||
| 360 | static int wb_writeback_work(struct dm_cache_policy *pe, | ||
| 361 | dm_oblock_t *oblock, | ||
| 362 | dm_cblock_t *cblock) | ||
| 363 | { | ||
| 364 | int r = -ENOENT; | ||
| 365 | struct policy *p = to_policy(pe); | ||
| 366 | struct wb_cache_entry *e; | ||
| 367 | unsigned long flags; | ||
| 368 | |||
| 369 | spin_lock_irqsave(&p->lock, flags); | ||
| 370 | |||
| 371 | e = get_next_dirty_entry(p); | ||
| 372 | if (e) { | ||
| 373 | *oblock = e->oblock; | ||
| 374 | *cblock = e->cblock; | ||
| 375 | r = 0; | ||
| 376 | } | ||
| 377 | |||
| 378 | spin_unlock_irqrestore(&p->lock, flags); | ||
| 379 | |||
| 380 | return r; | ||
| 381 | } | ||
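
The core target is expected to poll writeback_work until it returns -ENOENT; a minimal sketch of such a caller (not part of this file, and in practice it would go through the policy's ->writeback_work method rather than calling wb_writeback_work() directly):

    static void example_drain_dirty(struct dm_cache_policy *pe,
                                    void (*writeback)(dm_oblock_t, dm_cblock_t))
    {
            dm_oblock_t oblock;
            dm_cblock_t cblock;

            /* Each successful call moves one entry to clean_pending. */
            while (!wb_writeback_work(pe, &oblock, &cblock))
                    writeback(oblock, cblock);
    }
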
| 382 | |||
| 383 | static dm_cblock_t wb_residency(struct dm_cache_policy *pe) | ||
| 384 | { | ||
| 385 | return to_policy(pe)->nr_cblocks_allocated; | ||
| 386 | } | ||
| 387 | |||
| 388 | /* Init the policy plugin interface function pointers. */ | ||
| 389 | static void init_policy_functions(struct policy *p) | ||
| 390 | { | ||
| 391 | p->policy.destroy = wb_destroy; | ||
| 392 | p->policy.map = wb_map; | ||
| 393 | p->policy.lookup = wb_lookup; | ||
| 394 | p->policy.set_dirty = wb_set_dirty; | ||
| 395 | p->policy.clear_dirty = wb_clear_dirty; | ||
| 396 | p->policy.load_mapping = wb_load_mapping; | ||
| 397 | p->policy.walk_mappings = NULL; | ||
| 398 | p->policy.remove_mapping = wb_remove_mapping; | ||
| 399 | p->policy.writeback_work = wb_writeback_work; | ||
| 400 | p->policy.force_mapping = wb_force_mapping; | ||
| 401 | p->policy.residency = wb_residency; | ||
| 402 | p->policy.tick = NULL; | ||
| 403 | } | ||
| 404 | |||
| 405 | static struct dm_cache_policy *wb_create(dm_cblock_t cache_size, | ||
| 406 | sector_t origin_size, | ||
| 407 | sector_t cache_block_size) | ||
| 408 | { | ||
| 409 | int r; | ||
| 410 | struct policy *p = kzalloc(sizeof(*p), GFP_KERNEL); | ||
| 411 | |||
| 412 | if (!p) | ||
| 413 | return NULL; | ||
| 414 | |||
| 415 | init_policy_functions(p); | ||
| 416 | INIT_LIST_HEAD(&p->free); | ||
| 417 | INIT_LIST_HEAD(&p->clean); | ||
| 418 | INIT_LIST_HEAD(&p->clean_pending); | ||
| 419 | INIT_LIST_HEAD(&p->dirty); | ||
| 420 | |||
| 421 | p->cache_size = cache_size; | ||
| 422 | spin_lock_init(&p->lock); | ||
| 423 | |||
| 424 | /* Allocate cache entry structs and add them to free list. */ | ||
| 425 | r = alloc_cache_blocks_with_hash(p, cache_size); | ||
| 426 | if (!r) | ||
| 427 | return &p->policy; | ||
| 428 | |||
| 429 | kfree(p); | ||
| 430 | |||
| 431 | return NULL; | ||
| 432 | } | ||
| 433 | /*----------------------------------------------------------------------------*/ | ||
| 434 | |||
| 435 | static struct dm_cache_policy_type wb_policy_type = { | ||
| 436 | .name = "cleaner", | ||
| 437 | .hint_size = 0, | ||
| 438 | .owner = THIS_MODULE, | ||
| 439 | .create = wb_create | ||
| 440 | }; | ||
| 441 | |||
| 442 | static int __init wb_init(void) | ||
| 443 | { | ||
| 444 | int r = dm_cache_policy_register(&wb_policy_type); | ||
| 445 | |||
| 446 | if (r < 0) | ||
| 447 | DMERR("register failed %d", r); | ||
| 448 | else | ||
| 449 | DMINFO("version " CLEANER_VERSION " loaded"); | ||
| 450 | |||
| 451 | return r; | ||
| 452 | } | ||
| 453 | |||
| 454 | static void __exit wb_exit(void) | ||
| 455 | { | ||
| 456 | dm_cache_policy_unregister(&wb_policy_type); | ||
| 457 | } | ||
| 458 | |||
| 459 | module_init(wb_init); | ||
| 460 | module_exit(wb_exit); | ||
| 461 | |||
| 462 | MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>"); | ||
| 463 | MODULE_LICENSE("GPL"); | ||
| 464 | MODULE_DESCRIPTION("cleaner cache policy"); | ||
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h new file mode 100644 index 000000000000..52a75beeced5 --- /dev/null +++ b/drivers/md/dm-cache-policy-internal.h | |||
| @@ -0,0 +1,124 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2012 Red Hat. All rights reserved. | ||
| 3 | * | ||
| 4 | * This file is released under the GPL. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #ifndef DM_CACHE_POLICY_INTERNAL_H | ||
| 8 | #define DM_CACHE_POLICY_INTERNAL_H | ||
| 9 | |||
| 10 | #include "dm-cache-policy.h" | ||
| 11 | |||
| 12 | /*----------------------------------------------------------------*/ | ||
| 13 | |||
| 14 | /* | ||
| 15 | * Little inline functions that simplify calling the policy methods. | ||
| 16 | */ | ||
| 17 | static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock, | ||
| 18 | bool can_block, bool can_migrate, bool discarded_oblock, | ||
| 19 | struct bio *bio, struct policy_result *result) | ||
| 20 | { | ||
| 21 | return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, result); | ||
| 22 | } | ||
| 23 | |||
| 24 | static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) | ||
| 25 | { | ||
| 26 | BUG_ON(!p->lookup); | ||
| 27 | return p->lookup(p, oblock, cblock); | ||
| 28 | } | ||
| 29 | |||
| 30 | static inline void policy_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) | ||
| 31 | { | ||
| 32 | if (p->set_dirty) | ||
| 33 | p->set_dirty(p, oblock); | ||
| 34 | } | ||
| 35 | |||
| 36 | static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) | ||
| 37 | { | ||
| 38 | if (p->clear_dirty) | ||
| 39 | p->clear_dirty(p, oblock); | ||
| 40 | } | ||
| 41 | |||
| 42 | static inline int policy_load_mapping(struct dm_cache_policy *p, | ||
| 43 | dm_oblock_t oblock, dm_cblock_t cblock, | ||
| 44 | uint32_t hint, bool hint_valid) | ||
| 45 | { | ||
| 46 | return p->load_mapping(p, oblock, cblock, hint, hint_valid); | ||
| 47 | } | ||
| 48 | |||
| 49 | static inline int policy_walk_mappings(struct dm_cache_policy *p, | ||
| 50 | policy_walk_fn fn, void *context) | ||
| 51 | { | ||
| 52 | return p->walk_mappings ? p->walk_mappings(p, fn, context) : 0; | ||
| 53 | } | ||
| 54 | |||
| 55 | static inline int policy_writeback_work(struct dm_cache_policy *p, | ||
| 56 | dm_oblock_t *oblock, | ||
| 57 | dm_cblock_t *cblock) | ||
| 58 | { | ||
| 59 | return p->writeback_work ? p->writeback_work(p, oblock, cblock) : -ENOENT; | ||
| 60 | } | ||
| 61 | |||
| 62 | static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) | ||
| 63 | { | ||
| 64 | return p->remove_mapping(p, oblock); | ||
| 65 | } | ||
| 66 | |||
| 67 | static inline void policy_force_mapping(struct dm_cache_policy *p, | ||
| 68 | dm_oblock_t current_oblock, dm_oblock_t new_oblock) | ||
| 69 | { | ||
| 70 | return p->force_mapping(p, current_oblock, new_oblock); | ||
| 71 | } | ||
| 72 | |||
| 73 | static inline dm_cblock_t policy_residency(struct dm_cache_policy *p) | ||
| 74 | { | ||
| 75 | return p->residency(p); | ||
| 76 | } | ||
| 77 | |||
| 78 | static inline void policy_tick(struct dm_cache_policy *p) | ||
| 79 | { | ||
| 80 | if (p->tick) | ||
| 81 | return p->tick(p); | ||
| 82 | } | ||
| 83 | |||
| 84 | static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen) | ||
| 85 | { | ||
| 86 | ssize_t sz = 0; | ||
| 87 | if (p->emit_config_values) | ||
| 88 | return p->emit_config_values(p, result, maxlen); | ||
| 89 | |||
| 90 | DMEMIT("0"); | ||
| 91 | return 0; | ||
| 92 | } | ||
| 93 | |||
| 94 | static inline int policy_set_config_value(struct dm_cache_policy *p, | ||
| 95 | const char *key, const char *value) | ||
| 96 | { | ||
| 97 | return p->set_config_value ? p->set_config_value(p, key, value) : -EINVAL; | ||
| 98 | } | ||
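
Taken together, a caller in the core target might use these wrappers as follows (a sketch; the fall-back-to-blocking retry is an assumption about how a caller handles -EWOULDBLOCK):

    /* Hypothetical: try a non-blocking map first, then retry blocking. */
    static int example_map(struct dm_cache_policy *p, dm_oblock_t oblock,
                           bool discarded, struct bio *bio,
                           struct policy_result *result)
    {
            int r = policy_map(p, oblock, false, true, discarded, bio, result);

            if (r == -EWOULDBLOCK)
                    r = policy_map(p, oblock, true, true, discarded, bio, result);

            return r;
    }
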
| 99 | |||
| 100 | /*----------------------------------------------------------------*/ | ||
| 101 | |||
| 102 | /* | ||
| 103 | * Creates a new cache policy given a policy name, a cache size, an origin size and the block size. | ||
| 104 | */ | ||
| 105 | struct dm_cache_policy *dm_cache_policy_create(const char *name, dm_cblock_t cache_size, | ||
| 106 | sector_t origin_size, sector_t block_size); | ||
| 107 | |||
| 108 | /* | ||
| 109 | * Destroys the policy. This drops references to the policy module as well | ||
| 110 | * as calling its destroy method. So always use this rather than calling | ||
| 111 | * the policy->destroy method directly. | ||
| 112 | */ | ||
| 113 | void dm_cache_policy_destroy(struct dm_cache_policy *p); | ||
| 114 | |||
| 115 | /* | ||
| 116 | * In case we've forgotten. | ||
| 117 | */ | ||
| 118 | const char *dm_cache_policy_get_name(struct dm_cache_policy *p); | ||
| 119 | |||
| 120 | size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p); | ||
| 121 | |||
| 122 | /*----------------------------------------------------------------*/ | ||
| 123 | |||
| 124 | #endif /* DM_CACHE_POLICY_INTERNAL_H */ | ||
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c new file mode 100644 index 000000000000..964153255076 --- /dev/null +++ b/drivers/md/dm-cache-policy-mq.c | |||
| @@ -0,0 +1,1195 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2012 Red Hat. All rights reserved. | ||
| 3 | * | ||
| 4 | * This file is released under the GPL. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include "dm-cache-policy.h" | ||
| 8 | #include "dm.h" | ||
| 9 | |||
| 10 | #include <linux/hash.h> | ||
| 11 | #include <linux/module.h> | ||
| 12 | #include <linux/mutex.h> | ||
| 13 | #include <linux/slab.h> | ||
| 14 | #include <linux/vmalloc.h> | ||
| 15 | |||
| 16 | #define DM_MSG_PREFIX "cache-policy-mq" | ||
| 17 | #define MQ_VERSION "1.0.0" | ||
| 18 | |||
| 19 | static struct kmem_cache *mq_entry_cache; | ||
| 20 | |||
| 21 | /*----------------------------------------------------------------*/ | ||
| 22 | |||
| 23 | static unsigned next_power(unsigned n, unsigned min) | ||
| 24 | { | ||
| 25 | return roundup_pow_of_two(max(n, min)); | ||
| 26 | } | ||
| 27 | |||
| 28 | /*----------------------------------------------------------------*/ | ||
| 29 | |||
| 30 | static unsigned long *alloc_bitset(unsigned nr_entries) | ||
| 31 | { | ||
| 32 | size_t s = sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG); | ||
| 33 | return vzalloc(s); | ||
| 34 | } | ||
| 35 | |||
| 36 | static void free_bitset(unsigned long *bits) | ||
| 37 | { | ||
| 38 | vfree(bits); | ||
| 39 | } | ||
| 40 | |||
| 41 | /*----------------------------------------------------------------*/ | ||
| 42 | |||
| 43 | /* | ||
| 44 | * Large, sequential ios are probably better left on the origin device since | ||
| 45 | * spindles tend to have good bandwidth. | ||
| 46 | * | ||
| 47 | * The io_tracker tries to spot when the io is in one of these sequential | ||
| 48 | * modes. | ||
| 49 | * | ||
| 50 | * The two thresholds for switching between random and sequential io mode | ||
| 51 | * default as follows and can be adjusted via the constructor and message interfaces. | ||
| 52 | */ | ||
| 53 | #define RANDOM_THRESHOLD_DEFAULT 4 | ||
| 54 | #define SEQUENTIAL_THRESHOLD_DEFAULT 512 | ||
| 55 | |||
| 56 | enum io_pattern { | ||
| 57 | PATTERN_SEQUENTIAL, | ||
| 58 | PATTERN_RANDOM | ||
| 59 | }; | ||
| 60 | |||
| 61 | struct io_tracker { | ||
| 62 | enum io_pattern pattern; | ||
| 63 | |||
| 64 | unsigned nr_seq_samples; | ||
| 65 | unsigned nr_rand_samples; | ||
| 66 | unsigned thresholds[2]; | ||
| 67 | |||
| 68 | dm_oblock_t last_end_oblock; | ||
| 69 | }; | ||
| 70 | |||
| 71 | static void iot_init(struct io_tracker *t, | ||
| 72 | int sequential_threshold, int random_threshold) | ||
| 73 | { | ||
| 74 | t->pattern = PATTERN_RANDOM; | ||
| 75 | t->nr_seq_samples = 0; | ||
| 76 | t->nr_rand_samples = 0; | ||
| 77 | t->last_end_oblock = 0; | ||
| 78 | t->thresholds[PATTERN_RANDOM] = random_threshold; | ||
| 79 | t->thresholds[PATTERN_SEQUENTIAL] = sequential_threshold; | ||
| 80 | } | ||
| 81 | |||
| 82 | static enum io_pattern iot_pattern(struct io_tracker *t) | ||
| 83 | { | ||
| 84 | return t->pattern; | ||
| 85 | } | ||
| 86 | |||
| 87 | static void iot_update_stats(struct io_tracker *t, struct bio *bio) | ||
| 88 | { | ||
| 89 | if (bio->bi_sector == from_oblock(t->last_end_oblock) + 1) | ||
| 90 | t->nr_seq_samples++; | ||
| 91 | else { | ||
| 92 | /* | ||
| 93 | * Just one non-sequential IO is enough to reset the | ||
| 94 | * counters. | ||
| 95 | */ | ||
| 96 | if (t->nr_seq_samples) { | ||
| 97 | t->nr_seq_samples = 0; | ||
| 98 | t->nr_rand_samples = 0; | ||
| 99 | } | ||
| 100 | |||
| 101 | t->nr_rand_samples++; | ||
| 102 | } | ||
| 103 | |||
| 104 | t->last_end_oblock = to_oblock(bio->bi_sector + bio_sectors(bio) - 1); | ||
| 105 | } | ||
| 106 | |||
| 107 | static void iot_check_for_pattern_switch(struct io_tracker *t) | ||
| 108 | { | ||
| 109 | switch (t->pattern) { | ||
| 110 | case PATTERN_SEQUENTIAL: | ||
| 111 | if (t->nr_rand_samples >= t->thresholds[PATTERN_RANDOM]) { | ||
| 112 | t->pattern = PATTERN_RANDOM; | ||
| 113 | t->nr_seq_samples = t->nr_rand_samples = 0; | ||
| 114 | } | ||
| 115 | break; | ||
| 116 | |||
| 117 | case PATTERN_RANDOM: | ||
| 118 | if (t->nr_seq_samples >= t->thresholds[PATTERN_SEQUENTIAL]) { | ||
| 119 | t->pattern = PATTERN_SEQUENTIAL; | ||
| 120 | t->nr_seq_samples = t->nr_rand_samples = 0; | ||
| 121 | } | ||
| 122 | break; | ||
| 123 | } | ||
| 124 | } | ||
| 125 | |||
| 126 | static void iot_examine_bio(struct io_tracker *t, struct bio *bio) | ||
| 127 | { | ||
| 128 | iot_update_stats(t, bio); | ||
| 129 | iot_check_for_pattern_switch(t); | ||
| 130 | } | ||
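
To make the thresholds concrete, here is a worked trace (invented sector numbers; default thresholds of 4 random / 512 sequential):

    /*
     * A stream of back-to-back 8-sector bios, starting from a fresh
     * tracker in PATTERN_RANDOM:
     *
     *   bio at sector 0    -> not contiguous, nr_rand_samples = 1
     *   bio at sector 8    -> contiguous,     nr_seq_samples  = 1
     *   ...
     *   bio at sector 4096 -> nr_seq_samples hits 512, so the tracker
     *                         flips to PATTERN_SEQUENTIAL and zeroes
     *                         both counters.
     *
     * One non-contiguous bio then resets nr_seq_samples, and four of
     * them in a row flip the tracker back to PATTERN_RANDOM.
     */
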
| 131 | |||
| 132 | /*----------------------------------------------------------------*/ | ||
| 133 | |||
| 134 | |||
| 135 | /* | ||
| 136 | * This queue is divided up into different levels, allowing us to push | ||
| 137 | * entries to the back of any of the levels. Think of it as a partially | ||
| 138 | * sorted queue. | ||
| 139 | */ | ||
| 140 | #define NR_QUEUE_LEVELS 16u | ||
| 141 | |||
| 142 | struct queue { | ||
| 143 | struct list_head qs[NR_QUEUE_LEVELS]; | ||
| 144 | }; | ||
| 145 | |||
| 146 | static void queue_init(struct queue *q) | ||
| 147 | { | ||
| 148 | unsigned i; | ||
| 149 | |||
| 150 | for (i = 0; i < NR_QUEUE_LEVELS; i++) | ||
| 151 | INIT_LIST_HEAD(q->qs + i); | ||
| 152 | } | ||
| 153 | |||
| 154 | /* | ||
| 155 | * Insert an entry to the back of the given level. | ||
| 156 | */ | ||
| 157 | static void queue_push(struct queue *q, unsigned level, struct list_head *elt) | ||
| 158 | { | ||
| 159 | list_add_tail(elt, q->qs + level); | ||
| 160 | } | ||
| 161 | |||
| 162 | static void queue_remove(struct list_head *elt) | ||
| 163 | { | ||
| 164 | list_del(elt); | ||
| 165 | } | ||
| 166 | |||
| 167 | /* | ||
| 168 | * Shifts all entries down one level. This has no effect on the order of | ||
| 169 | * the queue. | ||
| 170 | */ | ||
| 171 | static void queue_shift_down(struct queue *q) | ||
| 172 | { | ||
| 173 | unsigned level; | ||
| 174 | |||
| 175 | for (level = 1; level < NR_QUEUE_LEVELS; level++) | ||
| 176 | list_splice_init(q->qs + level, q->qs + level - 1); | ||
| 177 | } | ||
| 178 | |||
| 179 | /* | ||
| 180 | * Gives us the oldest entry of the lowest populated level. If the first | ||
| 181 | * level is emptied then we shift down one level. | ||
| 182 | */ | ||
| 183 | static struct list_head *queue_pop(struct queue *q) | ||
| 184 | { | ||
| 185 | unsigned level; | ||
| 186 | struct list_head *r; | ||
| 187 | |||
| 188 | for (level = 0; level < NR_QUEUE_LEVELS; level++) | ||
| 189 | if (!list_empty(q->qs + level)) { | ||
| 190 | r = q->qs[level].next; | ||
| 191 | list_del(r); | ||
| 192 | |||
| 193 | /* have we just emptied the bottom level? */ | ||
| 194 | if (level == 0 && list_empty(q->qs)) | ||
| 195 | queue_shift_down(q); | ||
| 196 | |||
| 197 | return r; | ||
| 198 | } | ||
| 199 | |||
| 200 | return NULL; | ||
| 201 | } | ||
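
A small worked example of the ordering this gives (a sketch, not from the source):

    /*
     * With two entries pushed to level 0 and one to level 3:
     *
     *   queue_push(q, 0, &a);
     *   queue_push(q, 0, &b);
     *   queue_push(q, 3, &c);
     *
     * queue_pop() returns a, then b (oldest first within a level).
     * Popping b empties level 0, so queue_shift_down() moves c from
     * level 3 to level 2; the next pop scans upwards and returns c.
     */
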
| 202 | |||
| 203 | static struct list_head *list_pop(struct list_head *lh) | ||
| 204 | { | ||
| 205 | struct list_head *r = lh->next; | ||
| 206 | |||
| 207 | BUG_ON(!r); | ||
| 208 | list_del_init(r); | ||
| 209 | |||
| 210 | return r; | ||
| 211 | } | ||
| 212 | |||
| 213 | /*----------------------------------------------------------------*/ | ||
| 214 | |||
| 215 | /* | ||
| 216 | * Describes a cache entry. Used in both the cache and the pre_cache. | ||
| 217 | */ | ||
| 218 | struct entry { | ||
| 219 | struct hlist_node hlist; | ||
| 220 | struct list_head list; | ||
| 221 | dm_oblock_t oblock; | ||
| 222 | dm_cblock_t cblock; /* valid iff in_cache */ | ||
| 223 | |||
| 224 | /* | ||
| 225 | * FIXME: pack these better | ||
| 226 | */ | ||
| 227 | bool in_cache:1; | ||
| 228 | unsigned hit_count; | ||
| 229 | unsigned generation; | ||
| 230 | unsigned tick; | ||
| 231 | }; | ||
| 232 | |||
| 233 | struct mq_policy { | ||
| 234 | struct dm_cache_policy policy; | ||
| 235 | |||
| 236 | /* protects everything */ | ||
| 237 | struct mutex lock; | ||
| 238 | dm_cblock_t cache_size; | ||
| 239 | struct io_tracker tracker; | ||
| 240 | |||
| 241 | /* | ||
| 242 | * We maintain two queues of entries. The cache proper contains | ||
| 243 | * the currently active mappings, whereas the pre_cache tracks | ||
| 244 | * blocks that are being hit frequently and are potential candidates | ||
| 245 | * for promotion to the cache. | ||
| 246 | */ | ||
| 247 | struct queue pre_cache; | ||
| 248 | struct queue cache; | ||
| 249 | |||
| 250 | /* | ||
| 251 | * Keeps track of time, incremented by the core. We use this to | ||
| 252 | * avoid attributing multiple hits within the same tick. | ||
| 253 | * | ||
| 254 | * Access to tick_protected should be done with the spin lock held. | ||
| 255 | * It's copied to tick at the start of the map function (within the | ||
| 256 | * mutex). | ||
| 257 | */ | ||
| 258 | spinlock_t tick_lock; | ||
| 259 | unsigned tick_protected; | ||
| 260 | unsigned tick; | ||
| 261 | |||
| 262 | /* | ||
| 263 | * A count of the number of times the map function has been called | ||
| 264 | * and found an entry in the pre_cache or cache. Currently used to | ||
| 265 | * calculate the generation. | ||
| 266 | */ | ||
| 267 | unsigned hit_count; | ||
| 268 | |||
| 269 | /* | ||
| 270 | * A generation is a longish period that is used to trigger some | ||
| 271 | * bookkeeping effects, e.g. decrementing hit counts on entries. | ||
| 272 | * This is needed to allow the cache to evolve as io patterns | ||
| 273 | * change. | ||
| 274 | */ | ||
| 275 | unsigned generation; | ||
| 276 | unsigned generation_period; /* in lookups (will probably change) */ | ||
| 277 | |||
| 278 | /* | ||
| 279 | * Entries in the pre_cache whose hit count passes the promotion | ||
| 280 | * threshold move to the cache proper. Working out the correct | ||
| 281 | * value for the promotion_threshold is crucial to this policy. | ||
| 282 | */ | ||
| 283 | unsigned promote_threshold; | ||
| 284 | |||
| 285 | /* | ||
| 286 | * We need cache_size entries for the cache, and choose to have | ||
| 287 | * cache_size entries for the pre_cache too. One motivation for | ||
| 288 | * using the same size is to make the hit counts directly | ||
| 289 | * comparable between pre_cache and cache. | ||
| 290 | */ | ||
| 291 | unsigned nr_entries; | ||
| 292 | unsigned nr_entries_allocated; | ||
| 293 | struct list_head free; | ||
| 294 | |||
| 295 | /* | ||
| 296 | * Cache blocks may be unallocated. We store this info in a | ||
| 297 | * bitset. | ||
| 298 | */ | ||
| 299 | unsigned long *allocation_bitset; | ||
| 300 | unsigned nr_cblocks_allocated; | ||
| 301 | unsigned find_free_nr_words; | ||
| 302 | unsigned find_free_last_word; | ||
| 303 | |||
| 304 | /* | ||
| 305 | * The hash table allows us to quickly find an entry by origin | ||
| 306 | * block. Both pre_cache and cache entries are in here. | ||
| 307 | */ | ||
| 308 | unsigned nr_buckets; | ||
| 309 | dm_block_t hash_bits; | ||
| 310 | struct hlist_head *table; | ||
| 311 | }; | ||
| 312 | |||
| 313 | /*----------------------------------------------------------------*/ | ||
| 314 | /* Free/alloc mq cache entry structures. */ | ||
| 315 | static void takeout_queue(struct list_head *lh, struct queue *q) | ||
| 316 | { | ||
| 317 | unsigned level; | ||
| 318 | |||
| 319 | for (level = 0; level < NR_QUEUE_LEVELS; level++) | ||
| 320 | list_splice(q->qs + level, lh); | ||
| 321 | } | ||
| 322 | |||
| 323 | static void free_entries(struct mq_policy *mq) | ||
| 324 | { | ||
| 325 | struct entry *e, *tmp; | ||
| 326 | |||
| 327 | takeout_queue(&mq->free, &mq->pre_cache); | ||
| 328 | takeout_queue(&mq->free, &mq->cache); | ||
| 329 | |||
| 330 | list_for_each_entry_safe(e, tmp, &mq->free, list) | ||
| 331 | kmem_cache_free(mq_entry_cache, e); | ||
| 332 | } | ||
| 333 | |||
| 334 | static int alloc_entries(struct mq_policy *mq, unsigned elts) | ||
| 335 | { | ||
| 336 | unsigned u = mq->nr_entries; | ||
| 337 | |||
| 338 | INIT_LIST_HEAD(&mq->free); | ||
| 339 | mq->nr_entries_allocated = 0; | ||
| 340 | |||
| 341 | while (u--) { | ||
| 342 | struct entry *e = kmem_cache_zalloc(mq_entry_cache, GFP_KERNEL); | ||
| 343 | |||
| 344 | if (!e) { | ||
| 345 | free_entries(mq); | ||
| 346 | return -ENOMEM; | ||
| 347 | } | ||
| 348 | |||
| 349 | |||
| 350 | list_add(&e->list, &mq->free); | ||
| 351 | } | ||
| 352 | |||
| 353 | return 0; | ||
| 354 | } | ||
| 355 | |||
| 356 | /*----------------------------------------------------------------*/ | ||
| 357 | |||
| 358 | /* | ||
| 359 | * Simple hash table implementation. Should be replaced with the standard hash | ||
| 360 | * table that's making its way upstream. | ||
| 361 | */ | ||
| 362 | static void hash_insert(struct mq_policy *mq, struct entry *e) | ||
| 363 | { | ||
| 364 | unsigned h = hash_64(from_oblock(e->oblock), mq->hash_bits); | ||
| 365 | |||
| 366 | hlist_add_head(&e->hlist, mq->table + h); | ||
| 367 | } | ||
| 368 | |||
| 369 | static struct entry *hash_lookup(struct mq_policy *mq, dm_oblock_t oblock) | ||
| 370 | { | ||
| 371 | unsigned h = hash_64(from_oblock(oblock), mq->hash_bits); | ||
| 372 | struct hlist_head *bucket = mq->table + h; | ||
| 373 | struct entry *e; | ||
| 374 | |||
| 375 | hlist_for_each_entry(e, bucket, hlist) | ||
| 376 | if (e->oblock == oblock) { | ||
| 377 | hlist_del(&e->hlist); | ||
| 378 | hlist_add_head(&e->hlist, bucket); | ||
| 379 | return e; | ||
| 380 | } | ||
| 381 | |||
| 382 | return NULL; | ||
| 383 | } | ||
| 384 | |||
| 385 | static void hash_remove(struct entry *e) | ||
| 386 | { | ||
| 387 | hlist_del(&e->hlist); | ||
| 388 | } | ||
| 389 | |||
| 390 | /*----------------------------------------------------------------*/ | ||
| 391 | |||
| 392 | /* | ||
| 393 | * Allocates a new entry structure. The memory is allocated in one lump, | ||
| 394 | * so we just hand it out here. Returns NULL if all entries have | ||
| 395 | * already been allocated. Cannot fail otherwise. | ||
| 396 | */ | ||
| 397 | static struct entry *alloc_entry(struct mq_policy *mq) | ||
| 398 | { | ||
| 399 | struct entry *e; | ||
| 400 | |||
| 401 | if (mq->nr_entries_allocated >= mq->nr_entries) { | ||
| 402 | BUG_ON(!list_empty(&mq->free)); | ||
| 403 | return NULL; | ||
| 404 | } | ||
| 405 | |||
| 406 | e = list_entry(list_pop(&mq->free), struct entry, list); | ||
| 407 | INIT_LIST_HEAD(&e->list); | ||
| 408 | INIT_HLIST_NODE(&e->hlist); | ||
| 409 | |||
| 410 | mq->nr_entries_allocated++; | ||
| 411 | return e; | ||
| 412 | } | ||
| 413 | |||
| 414 | /*----------------------------------------------------------------*/ | ||
| 415 | |||
| 416 | /* | ||
| 417 | * Mark cache blocks allocated or not in the bitset. | ||
| 418 | */ | ||
| 419 | static void alloc_cblock(struct mq_policy *mq, dm_cblock_t cblock) | ||
| 420 | { | ||
| 421 | BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size)); | ||
| 422 | BUG_ON(test_bit(from_cblock(cblock), mq->allocation_bitset)); | ||
| 423 | |||
| 424 | set_bit(from_cblock(cblock), mq->allocation_bitset); | ||
| 425 | mq->nr_cblocks_allocated++; | ||
| 426 | } | ||
| 427 | |||
| 428 | static void free_cblock(struct mq_policy *mq, dm_cblock_t cblock) | ||
| 429 | { | ||
| 430 | BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size)); | ||
| 431 | BUG_ON(!test_bit(from_cblock(cblock), mq->allocation_bitset)); | ||
| 432 | |||
| 433 | clear_bit(from_cblock(cblock), mq->allocation_bitset); | ||
| 434 | mq->nr_cblocks_allocated--; | ||
| 435 | } | ||
| 436 | |||
| 437 | static bool any_free_cblocks(struct mq_policy *mq) | ||
| 438 | { | ||
| 439 | return mq->nr_cblocks_allocated < from_cblock(mq->cache_size); | ||
| 440 | } | ||
| 441 | |||
| 442 | /* | ||
| 443 | * Fills *result with a cache block that isn't in use, or returns | ||
| 444 | * -ENOSPC. This does _not_ mark the cblock as allocated; the caller is | ||
| 445 | * responsible for that. | ||
| 446 | */ | ||
| 447 | static int __find_free_cblock(struct mq_policy *mq, unsigned begin, unsigned end, | ||
| 448 | dm_cblock_t *result, unsigned *last_word) | ||
| 449 | { | ||
| 450 | int r = -ENOSPC; | ||
| 451 | unsigned w; | ||
| 452 | |||
| 453 | for (w = begin; w < end; w++) { | ||
| 454 | /* | ||
| 455 | * ffz is undefined if no zero exists | ||
| 456 | */ | ||
| 457 | if (mq->allocation_bitset[w] != ~0UL) { | ||
| 458 | *last_word = w; | ||
| 459 | *result = to_cblock((w * BITS_PER_LONG) + ffz(mq->allocation_bitset[w])); | ||
| 460 | if (from_cblock(*result) < from_cblock(mq->cache_size)) | ||
| 461 | r = 0; | ||
| 462 | |||
| 463 | break; | ||
| 464 | } | ||
| 465 | } | ||
| 466 | |||
| 467 | return r; | ||
| 468 | } | ||
| 469 | |||
| 470 | static int find_free_cblock(struct mq_policy *mq, dm_cblock_t *result) | ||
| 471 | { | ||
| 472 | int r; | ||
| 473 | |||
| 474 | if (!any_free_cblocks(mq)) | ||
| 475 | return -ENOSPC; | ||
| 476 | |||
| 477 | r = __find_free_cblock(mq, mq->find_free_last_word, mq->find_free_nr_words, result, &mq->find_free_last_word); | ||
| 478 | if (r == -ENOSPC && mq->find_free_last_word) | ||
| 479 | r = __find_free_cblock(mq, 0, mq->find_free_last_word, result, &mq->find_free_last_word); | ||
| 480 | |||
| 481 | return r; | ||
| 482 | } | ||
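
Since find_free_cblock() only locates a free block, a caller pairs it with alloc_cblock(); roughly (a sketch):

    /* Hypothetical: claim any free cache block, or fail with -ENOSPC. */
    static int example_claim_cblock(struct mq_policy *mq, dm_cblock_t *result)
    {
            int r = find_free_cblock(mq, result);

            if (!r)
                    alloc_cblock(mq, *result); /* mark it used in the bitset */

            return r;
    }
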
| 483 | |||
| 484 | /*----------------------------------------------------------------*/ | ||
| 485 | |||
| 486 | /* | ||
| 487 | * Now we get to the meat of the policy. This section deals with deciding | ||
| 488 | * when to add entries to the pre_cache and cache, and move between | ||
| 489 | * them. | ||
| 490 | */ | ||
| 491 | |||
| 492 | /* | ||
| 493 | * The queue level is based on the log2 of the hit count. | ||
| 494 | */ | ||
| 495 | static unsigned queue_level(struct entry *e) | ||
| 496 | { | ||
| 497 | return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u); | ||
| 498 | } | ||
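
A few sample values make the mapping concrete:

    /*
     * hit_count 1     -> level 0
     * hit_count 2..3  -> level 1
     * hit_count 8..15 -> level 3
     * hit_count >= 32768 (2^15) is clamped to the top level, 15.
     */
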
| 499 | |||
| 500 | /* | ||
| 501 | * Inserts the entry into the pre_cache or the cache. Ensures the cache | ||
| 502 | * block is marked as allocated if necessary. Inserts into the hash table. Sets the | ||
| 503 | * tick which records when the entry was last moved about. | ||
| 504 | */ | ||
| 505 | static void push(struct mq_policy *mq, struct entry *e) | ||
| 506 | { | ||
| 507 | e->tick = mq->tick; | ||
| 508 | hash_insert(mq, e); | ||
| 509 | |||
| 510 | if (e->in_cache) { | ||
| 511 | alloc_cblock(mq, e->cblock); | ||
| 512 | queue_push(&mq->cache, queue_level(e), &e->list); | ||
| 513 | } else | ||
| 514 | queue_push(&mq->pre_cache, queue_level(e), &e->list); | ||
| 515 | } | ||
| 516 | |||
| 517 | /* | ||
| 518 | * Removes an entry from pre_cache or cache. Removes from the hash table. | ||
| 519 | * Frees the cache block if necessary. | ||
| 520 | */ | ||
| 521 | static void del(struct mq_policy *mq, struct entry *e) | ||
| 522 | { | ||
| 523 | queue_remove(&e->list); | ||
| 524 | hash_remove(e); | ||
| 525 | if (e->in_cache) | ||
| 526 | free_cblock(mq, e->cblock); | ||
| 527 | } | ||
| 528 | |||
| 529 | /* | ||
| 530 | * Like del, except it removes the first entry in the queue (ie. the least | ||
| 531 | * recently used). | ||
| 532 | */ | ||
| 533 | static struct entry *pop(struct mq_policy *mq, struct queue *q) | ||
| 534 | { | ||
| 535 | struct entry *e = container_of(queue_pop(q), struct entry, list); | ||
| 536 | |||
| 537 | if (e) { | ||
| 538 | hash_remove(e); | ||
| 539 | |||
| 540 | if (e->in_cache) | ||
| 541 | free_cblock(mq, e->cblock); | ||
| 542 | } | ||
| 543 | |||
| 544 | return e; | ||
| 545 | } | ||
| 546 | |||
| 547 | /* | ||
| 548 | * Has this entry already been updated this tick? | ||
| 549 | */ | ||
| 550 | static bool updated_this_tick(struct mq_policy *mq, struct entry *e) | ||
| 551 | { | ||
| 552 | return mq->tick == e->tick; | ||
| 553 | } | ||
| 554 | |||
| 555 | /* | ||
| 556 | * The promotion threshold is adjusted every generation, as are the hit | ||
| 557 | * counts of the entries. | ||
| 558 | * | ||
| 559 | * At the moment the threshold is taken by averaging the hit counts of some | ||
| 560 | * of the entries in the cache (the first 20 entries, scanning up from the | ||
| 561 | * lowest level). | ||
| 562 | * | ||
| 563 | * We can be much cleverer than this though. For example, each promotion | ||
| 564 | * could bump up the threshold, helping to prevent churn. Much more to do here. | ||
| 565 | */ | ||
| 566 | |||
| 567 | #define MAX_TO_AVERAGE 20 | ||
| 568 | |||
| 569 | static void check_generation(struct mq_policy *mq) | ||
| 570 | { | ||
| 571 | unsigned total = 0, nr = 0, count = 0, level; | ||
| 572 | struct list_head *head; | ||
| 573 | struct entry *e; | ||
| 574 | |||
| 575 | if ((mq->hit_count >= mq->generation_period) && | ||
| 576 | (mq->nr_cblocks_allocated == from_cblock(mq->cache_size))) { | ||
| 577 | |||
| 578 | mq->hit_count = 0; | ||
| 579 | mq->generation++; | ||
| 580 | |||
| 581 | for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) { | ||
| 582 | head = mq->cache.qs + level; | ||
| 583 | list_for_each_entry(e, head, list) { | ||
| 584 | nr++; | ||
| 585 | total += e->hit_count; | ||
| 586 | |||
| 587 | if (++count >= MAX_TO_AVERAGE) | ||
| 588 | break; | ||
| 589 | } | ||
| 590 | } | ||
| 591 | |||
| 592 | mq->promote_threshold = nr ? total / nr : 1; | ||
| 593 | if (mq->promote_threshold * nr < total) | ||
| 594 | mq->promote_threshold++; | ||
| 595 | } | ||
| 596 | } | ||
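
The last two statements implement a ceiling average of the sampled hit counts. A minimal standalone check of that arithmetic, with hypothetical sample values:

#include <stdio.h>

int main(void)
{
	unsigned total = 57, nr = 20;			/* 20 sampled entries, 57 hits */
	unsigned threshold = nr ? total / nr : 1;	/* 57 / 20 = 2, truncated */

	if (threshold * nr < total)			/* 2 * 20 = 40 < 57, so round up */
		threshold++;

	printf("promote_threshold = %u\n", threshold);	/* prints 3 */
	return 0;
}
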
| 597 | |||
| 598 | /* | ||
| 599 | * Whenever we use an entry we bump up its hit counter, and push it to the | ||
| 600 | * back of its current level. | ||
| 601 | */ | ||
| 602 | static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e) | ||
| 603 | { | ||
| 604 | if (updated_this_tick(mq, e)) | ||
| 605 | return; | ||
| 606 | |||
| 607 | e->hit_count++; | ||
| 608 | mq->hit_count++; | ||
| 609 | check_generation(mq); | ||
| 610 | |||
| 611 | /* generation adjustment, to stop the counts increasing forever. */ | ||
| 612 | /* FIXME: divide? */ | ||
| 613 | /* e->hit_count -= min(e->hit_count - 1, mq->generation - e->generation); */ | ||
| 614 | e->generation = mq->generation; | ||
| 615 | |||
| 616 | del(mq, e); | ||
| 617 | push(mq, e); | ||
| 618 | } | ||
| 619 | |||
| 620 | /* | ||
| 621 | * Demote the least recently used entry from the cache to the pre_cache. | ||
| 622 | * Returns the new cache entry to use, and the old origin block it was | ||
| 623 | * mapped to. | ||
| 624 | * | ||
| 625 | * We drop the hit count on the demoted entry back to 1 to stop it bouncing | ||
| 626 | * straight back into the cache if it's subsequently hit. There are | ||
| 627 | * various options here, and more experimentation would be good: | ||
| 628 | * | ||
| 629 | * - just forget about the demoted entry completely (ie. don't insert it | ||
| 630 | * into the pre_cache). | ||
| 631 | * - divide the hit count rather than setting it to some hard coded value. | ||
| 632 | * - set the hit count to a hard coded value other than 1, eg, is it better | ||
| 633 | * if it goes in at level 2? | ||
| 634 | */ | ||
| 635 | static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock) | ||
| 636 | { | ||
| 637 | dm_cblock_t result; | ||
| 638 | struct entry *demoted = pop(mq, &mq->cache); | ||
| 639 | |||
| 640 | BUG_ON(!demoted); | ||
| 641 | result = demoted->cblock; | ||
| 642 | *oblock = demoted->oblock; | ||
| 643 | demoted->in_cache = false; | ||
| 644 | demoted->hit_count = 1; | ||
| 645 | push(mq, demoted); | ||
| 646 | |||
| 647 | return result; | ||
| 648 | } | ||
| 649 | |||
| 650 | /* | ||
| 651 | * We modify the basic promote_threshold depending on the specific io. | ||
| 652 | * | ||
| 653 | * If the origin block has been discarded then there's no cost to copy it | ||
| 654 | * to the cache. | ||
| 655 | * | ||
| 656 | * We bias towards reads, since they can be demoted at no cost if they | ||
| 657 | * haven't been dirtied (eg. with promote_threshold 3, a read is promoted after 7 hits, a write after 11). | ||
| 658 | */ | ||
| 659 | #define DISCARDED_PROMOTE_THRESHOLD 1 | ||
| 660 | #define READ_PROMOTE_THRESHOLD 4 | ||
| 661 | #define WRITE_PROMOTE_THRESHOLD 8 | ||
| 662 | |||
| 663 | static unsigned adjusted_promote_threshold(struct mq_policy *mq, | ||
| 664 | bool discarded_oblock, int data_dir) | ||
| 665 | { | ||
| 666 | if (discarded_oblock && any_free_cblocks(mq) && data_dir == WRITE) | ||
| 667 | /* | ||
| 668 | * We don't need to do any copying at all, so give this a | ||
| 669 | * very low threshold. In practice this only triggers | ||
| 670 | * during initial population after a format. | ||
| 671 | */ | ||
| 672 | return DISCARDED_PROMOTE_THRESHOLD; | ||
| 673 | |||
| 674 | return data_dir == READ ? | ||
| 675 | (mq->promote_threshold + READ_PROMOTE_THRESHOLD) : | ||
| 676 | (mq->promote_threshold + WRITE_PROMOTE_THRESHOLD); | ||
| 677 | } | ||
| 678 | |||
| 679 | static bool should_promote(struct mq_policy *mq, struct entry *e, | ||
| 680 | bool discarded_oblock, int data_dir) | ||
| 681 | { | ||
| 682 | return e->hit_count >= | ||
| 683 | adjusted_promote_threshold(mq, discarded_oblock, data_dir); | ||
| 684 | } | ||
| 685 | |||
| 686 | static int cache_entry_found(struct mq_policy *mq, | ||
| 687 | struct entry *e, | ||
| 688 | struct policy_result *result) | ||
| 689 | { | ||
| 690 | requeue_and_update_tick(mq, e); | ||
| 691 | |||
| 692 | if (e->in_cache) { | ||
| 693 | result->op = POLICY_HIT; | ||
| 694 | result->cblock = e->cblock; | ||
| 695 | } | ||
| 696 | |||
| 697 | return 0; | ||
| 698 | } | ||
| 699 | |||
| 700 | /* | ||
| 701 | * Moves an entry from the pre_cache to the cache. The main work is | ||
| 702 | * finding which cache block to use. | ||
| 703 | */ | ||
| 704 | static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e, | ||
| 705 | struct policy_result *result) | ||
| 706 | { | ||
| 707 | dm_cblock_t cblock; | ||
| 708 | |||
| 709 | if (find_free_cblock(mq, &cblock) == -ENOSPC) { | ||
| 710 | result->op = POLICY_REPLACE; | ||
| 711 | cblock = demote_cblock(mq, &result->old_oblock); | ||
| 712 | } else | ||
| 713 | result->op = POLICY_NEW; | ||
| 714 | |||
| 715 | result->cblock = e->cblock = cblock; | ||
| 716 | |||
| 717 | del(mq, e); | ||
| 718 | e->in_cache = true; | ||
| 719 | push(mq, e); | ||
| 720 | |||
| 721 | return 0; | ||
| 722 | } | ||
| 723 | |||
| 724 | static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e, | ||
| 725 | bool can_migrate, bool discarded_oblock, | ||
| 726 | int data_dir, struct policy_result *result) | ||
| 727 | { | ||
| 728 | int r = 0; | ||
| 729 | bool updated = updated_this_tick(mq, e); | ||
| 730 | |||
| 731 | requeue_and_update_tick(mq, e); | ||
| 732 | |||
| 733 | if ((!discarded_oblock && updated) || | ||
| 734 | !should_promote(mq, e, discarded_oblock, data_dir)) | ||
| 735 | result->op = POLICY_MISS; | ||
| 736 | else if (!can_migrate) | ||
| 737 | r = -EWOULDBLOCK; | ||
| 738 | else | ||
| 739 | r = pre_cache_to_cache(mq, e, result); | ||
| 740 | |||
| 741 | return r; | ||
| 742 | } | ||
| 743 | |||
| 744 | static void insert_in_pre_cache(struct mq_policy *mq, | ||
| 745 | dm_oblock_t oblock) | ||
| 746 | { | ||
| 747 | struct entry *e = alloc_entry(mq); | ||
| 748 | |||
| 749 | if (!e) | ||
| 750 | /* | ||
| 751 | * There's no spare entry structure, so we grab the least | ||
| 752 | * used one from the pre_cache. | ||
| 753 | */ | ||
| 754 | e = pop(mq, &mq->pre_cache); | ||
| 755 | |||
| 756 | if (unlikely(!e)) { | ||
| 757 | DMWARN("couldn't pop from pre cache"); | ||
| 758 | return; | ||
| 759 | } | ||
| 760 | |||
| 761 | e->in_cache = false; | ||
| 762 | e->oblock = oblock; | ||
| 763 | e->hit_count = 1; | ||
| 764 | e->generation = mq->generation; | ||
| 765 | push(mq, e); | ||
| 766 | } | ||
| 767 | |||
| 768 | static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock, | ||
| 769 | struct policy_result *result) | ||
| 770 | { | ||
| 771 | struct entry *e; | ||
| 772 | dm_cblock_t cblock; | ||
| 773 | |||
| 774 | if (find_free_cblock(mq, &cblock) == -ENOSPC) { | ||
| 775 | result->op = POLICY_MISS; | ||
| 776 | insert_in_pre_cache(mq, oblock); | ||
| 777 | return; | ||
| 778 | } | ||
| 779 | |||
| 780 | e = alloc_entry(mq); | ||
| 781 | if (unlikely(!e)) { | ||
| 782 | result->op = POLICY_MISS; | ||
| 783 | return; | ||
| 784 | } | ||
| 785 | |||
| 786 | e->oblock = oblock; | ||
| 787 | e->cblock = cblock; | ||
| 788 | e->in_cache = true; | ||
| 789 | e->hit_count = 1; | ||
| 790 | e->generation = mq->generation; | ||
| 791 | push(mq, e); | ||
| 792 | |||
| 793 | result->op = POLICY_NEW; | ||
| 794 | result->cblock = e->cblock; | ||
| 795 | } | ||
| 796 | |||
| 797 | static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock, | ||
| 798 | bool can_migrate, bool discarded_oblock, | ||
| 799 | int data_dir, struct policy_result *result) | ||
| 800 | { | ||
| 801 | if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) == 1) { | ||
| 802 | if (can_migrate) | ||
| 803 | insert_in_cache(mq, oblock, result); | ||
| 804 | else | ||
| 805 | return -EWOULDBLOCK; | ||
| 806 | } else { | ||
| 807 | insert_in_pre_cache(mq, oblock); | ||
| 808 | result->op = POLICY_MISS; | ||
| 809 | } | ||
| 810 | |||
| 811 | return 0; | ||
| 812 | } | ||
| 813 | |||
| 814 | /* | ||
| 815 | * Looks the oblock up in the hash table, then decides whether to put it in | ||
| 816 | * the pre_cache, the cache etc. | ||
| 817 | */ | ||
| 818 | static int map(struct mq_policy *mq, dm_oblock_t oblock, | ||
| 819 | bool can_migrate, bool discarded_oblock, | ||
| 820 | int data_dir, struct policy_result *result) | ||
| 821 | { | ||
| 822 | int r = 0; | ||
| 823 | struct entry *e = hash_lookup(mq, oblock); | ||
| 824 | |||
| 825 | if (e && e->in_cache) | ||
| 826 | r = cache_entry_found(mq, e, result); | ||
| 827 | else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL) | ||
| 828 | result->op = POLICY_MISS; | ||
| 829 | else if (e) | ||
| 830 | r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock, | ||
| 831 | data_dir, result); | ||
| 832 | else | ||
| 833 | r = no_entry_found(mq, oblock, can_migrate, discarded_oblock, | ||
| 834 | data_dir, result); | ||
| 835 | |||
| 836 | if (r == -EWOULDBLOCK) | ||
| 837 | result->op = POLICY_MISS; | ||
| 838 | |||
| 839 | return r; | ||
| 840 | } | ||
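
In summary, map() resolves each lookup to one of four outcomes: a block already in the cache is a POLICY_HIT; sequential IO and cold pre_cache or unknown blocks are a POLICY_MISS; a sufficiently hot pre_cache block becomes POLICY_NEW when a free cache block is available, or POLICY_REPLACE when one must be demoted first; and any path that would have needed a migration while can_migrate is false returns -EWOULDBLOCK with the op forced back to POLICY_MISS.
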
| 841 | |||
| 842 | /*----------------------------------------------------------------*/ | ||
| 843 | |||
| 844 | /* | ||
| 845 | * Public interface, via the policy struct. See dm-cache-policy.h for a | ||
| 846 | * description of these. | ||
| 847 | */ | ||
| 848 | |||
| 849 | static struct mq_policy *to_mq_policy(struct dm_cache_policy *p) | ||
| 850 | { | ||
| 851 | return container_of(p, struct mq_policy, policy); | ||
| 852 | } | ||
| 853 | |||
| 854 | static void mq_destroy(struct dm_cache_policy *p) | ||
| 855 | { | ||
| 856 | struct mq_policy *mq = to_mq_policy(p); | ||
| 857 | |||
| 858 | free_bitset(mq->allocation_bitset); | ||
| 859 | kfree(mq->table); | ||
| 860 | free_entries(mq); | ||
| 861 | kfree(mq); | ||
| 862 | } | ||
| 863 | |||
| 864 | static void copy_tick(struct mq_policy *mq) | ||
| 865 | { | ||
| 866 | unsigned long flags; | ||
| 867 | |||
| 868 | spin_lock_irqsave(&mq->tick_lock, flags); | ||
| 869 | mq->tick = mq->tick_protected; | ||
| 870 | spin_unlock_irqrestore(&mq->tick_lock, flags); | ||
| 871 | } | ||
| 872 | |||
| 873 | static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock, | ||
| 874 | bool can_block, bool can_migrate, bool discarded_oblock, | ||
| 875 | struct bio *bio, struct policy_result *result) | ||
| 876 | { | ||
| 877 | int r; | ||
| 878 | struct mq_policy *mq = to_mq_policy(p); | ||
| 879 | |||
| 880 | result->op = POLICY_MISS; | ||
| 881 | |||
| 882 | if (can_block) | ||
| 883 | mutex_lock(&mq->lock); | ||
| 884 | else if (!mutex_trylock(&mq->lock)) | ||
| 885 | return -EWOULDBLOCK; | ||
| 886 | |||
| 887 | copy_tick(mq); | ||
| 888 | |||
| 889 | iot_examine_bio(&mq->tracker, bio); | ||
| 890 | r = map(mq, oblock, can_migrate, discarded_oblock, | ||
| 891 | bio_data_dir(bio), result); | ||
| 892 | |||
| 893 | mutex_unlock(&mq->lock); | ||
| 894 | |||
| 895 | return r; | ||
| 896 | } | ||
| 897 | |||
| 898 | static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) | ||
| 899 | { | ||
| 900 | int r; | ||
| 901 | struct mq_policy *mq = to_mq_policy(p); | ||
| 902 | struct entry *e; | ||
| 903 | |||
| 904 | if (!mutex_trylock(&mq->lock)) | ||
| 905 | return -EWOULDBLOCK; | ||
| 906 | |||
| 907 | e = hash_lookup(mq, oblock); | ||
| 908 | if (e && e->in_cache) { | ||
| 909 | *cblock = e->cblock; | ||
| 910 | r = 0; | ||
| 911 | } else | ||
| 912 | r = -ENOENT; | ||
| 913 | |||
| 914 | mutex_unlock(&mq->lock); | ||
| 915 | |||
| 916 | return r; | ||
| 917 | } | ||
| 918 | |||
| 919 | static int mq_load_mapping(struct dm_cache_policy *p, | ||
| 920 | dm_oblock_t oblock, dm_cblock_t cblock, | ||
| 921 | uint32_t hint, bool hint_valid) | ||
| 922 | { | ||
| 923 | struct mq_policy *mq = to_mq_policy(p); | ||
| 924 | struct entry *e; | ||
| 925 | |||
| 926 | e = alloc_entry(mq); | ||
| 927 | if (!e) | ||
| 928 | return -ENOMEM; | ||
| 929 | |||
| 930 | e->cblock = cblock; | ||
| 931 | e->oblock = oblock; | ||
| 932 | e->in_cache = true; | ||
| 933 | e->hit_count = hint_valid ? hint : 1; | ||
| 934 | e->generation = mq->generation; | ||
| 935 | push(mq, e); | ||
| 936 | |||
| 937 | return 0; | ||
| 938 | } | ||
| 939 | |||
| 940 | static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn, | ||
| 941 | void *context) | ||
| 942 | { | ||
| 943 | struct mq_policy *mq = to_mq_policy(p); | ||
| 944 | int r = 0; | ||
| 945 | struct entry *e; | ||
| 946 | unsigned level; | ||
| 947 | |||
| 948 | mutex_lock(&mq->lock); | ||
| 949 | |||
| 950 | for (level = 0; level < NR_QUEUE_LEVELS; level++) | ||
| 951 | list_for_each_entry(e, &mq->cache.qs[level], list) { | ||
| 952 | r = fn(context, e->cblock, e->oblock, e->hit_count); | ||
| 953 | if (r) | ||
| 954 | goto out; | ||
| 955 | } | ||
| 956 | |||
| 957 | out: | ||
| 958 | mutex_unlock(&mq->lock); | ||
| 959 | |||
| 960 | return r; | ||
| 961 | } | ||
| 962 | |||
| 963 | static void remove_mapping(struct mq_policy *mq, dm_oblock_t oblock) | ||
| 964 | { | ||
| 965 | struct entry *e = hash_lookup(mq, oblock); | ||
| 966 | |||
| 967 | BUG_ON(!e || !e->in_cache); | ||
| 968 | |||
| 969 | del(mq, e); | ||
| 970 | e->in_cache = false; | ||
| 971 | push(mq, e); | ||
| 972 | } | ||
| 973 | |||
| 974 | static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) | ||
| 975 | { | ||
| 976 | struct mq_policy *mq = to_mq_policy(p); | ||
| 977 | |||
| 978 | mutex_lock(&mq->lock); | ||
| 979 | remove_mapping(mq, oblock); | ||
| 980 | mutex_unlock(&mq->lock); | ||
| 981 | } | ||
| 982 | |||
| 983 | static void force_mapping(struct mq_policy *mq, | ||
| 984 | dm_oblock_t current_oblock, dm_oblock_t new_oblock) | ||
| 985 | { | ||
| 986 | struct entry *e = hash_lookup(mq, current_oblock); | ||
| 987 | |||
| 988 | BUG_ON(!e || !e->in_cache); | ||
| 989 | |||
| 990 | del(mq, e); | ||
| 991 | e->oblock = new_oblock; | ||
| 992 | push(mq, e); | ||
| 993 | } | ||
| 994 | |||
| 995 | static void mq_force_mapping(struct dm_cache_policy *p, | ||
| 996 | dm_oblock_t current_oblock, dm_oblock_t new_oblock) | ||
| 997 | { | ||
| 998 | struct mq_policy *mq = to_mq_policy(p); | ||
| 999 | |||
| 1000 | mutex_lock(&mq->lock); | ||
| 1001 | force_mapping(mq, current_oblock, new_oblock); | ||
| 1002 | mutex_unlock(&mq->lock); | ||
| 1003 | } | ||
| 1004 | |||
| 1005 | static dm_cblock_t mq_residency(struct dm_cache_policy *p) | ||
| 1006 | { | ||
| 1007 | struct mq_policy *mq = to_mq_policy(p); | ||
| 1008 | |||
| 1009 | /* FIXME: lock mutex, not sure we can block here */ | ||
| 1010 | return to_cblock(mq->nr_cblocks_allocated); | ||
| 1011 | } | ||
| 1012 | |||
| 1013 | static void mq_tick(struct dm_cache_policy *p) | ||
| 1014 | { | ||
| 1015 | struct mq_policy *mq = to_mq_policy(p); | ||
| 1016 | unsigned long flags; | ||
| 1017 | |||
| 1018 | spin_lock_irqsave(&mq->tick_lock, flags); | ||
| 1019 | mq->tick_protected++; | ||
| 1020 | spin_unlock_irqrestore(&mq->tick_lock, flags); | ||
| 1021 | } | ||
| 1022 | |||
| 1023 | static int mq_set_config_value(struct dm_cache_policy *p, | ||
| 1024 | const char *key, const char *value) | ||
| 1025 | { | ||
| 1026 | struct mq_policy *mq = to_mq_policy(p); | ||
| 1027 | enum io_pattern pattern; | ||
| 1028 | unsigned long tmp; | ||
| 1029 | |||
| 1030 | if (!strcasecmp(key, "random_threshold")) | ||
| 1031 | pattern = PATTERN_RANDOM; | ||
| 1032 | else if (!strcasecmp(key, "sequential_threshold")) | ||
| 1033 | pattern = PATTERN_SEQUENTIAL; | ||
| 1034 | else | ||
| 1035 | return -EINVAL; | ||
| 1036 | |||
| 1037 | if (kstrtoul(value, 10, &tmp)) | ||
| 1038 | return -EINVAL; | ||
| 1039 | |||
| 1040 | mq->tracker.thresholds[pattern] = tmp; | ||
| 1041 | |||
| 1042 | return 0; | ||
| 1043 | } | ||
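
Both keys can be changed at runtime as well as on the table line: assuming the core cache target forwards message key/value pairs it does not itself recognise on to policy_set_config_value() (as this patchset arranges), a command along the following lines should end up here (device name hypothetical):

    dmsetup message my-cache 0 sequential_threshold 1024

Note that kstrtoul() rejects anything that is not a plain decimal number, so malformed values fail with -EINVAL rather than being clamped.
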
| 1044 | |||
| 1045 | static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen) | ||
| 1046 | { | ||
| 1047 | ssize_t sz = 0; | ||
| 1048 | struct mq_policy *mq = to_mq_policy(p); | ||
| 1049 | |||
| 1050 | DMEMIT("4 random_threshold %u sequential_threshold %u", | ||
| 1051 | mq->tracker.thresholds[PATTERN_RANDOM], | ||
| 1052 | mq->tracker.thresholds[PATTERN_SEQUENTIAL]); | ||
| 1053 | |||
| 1054 | return 0; | ||
| 1055 | } | ||
| 1056 | |||
| 1057 | /* Init the policy plugin interface function pointers. */ | ||
| 1058 | static void init_policy_functions(struct mq_policy *mq) | ||
| 1059 | { | ||
| 1060 | mq->policy.destroy = mq_destroy; | ||
| 1061 | mq->policy.map = mq_map; | ||
| 1062 | mq->policy.lookup = mq_lookup; | ||
| 1063 | mq->policy.load_mapping = mq_load_mapping; | ||
| 1064 | mq->policy.walk_mappings = mq_walk_mappings; | ||
| 1065 | mq->policy.remove_mapping = mq_remove_mapping; | ||
| 1066 | mq->policy.writeback_work = NULL; | ||
| 1067 | mq->policy.force_mapping = mq_force_mapping; | ||
| 1068 | mq->policy.residency = mq_residency; | ||
| 1069 | mq->policy.tick = mq_tick; | ||
| 1070 | mq->policy.emit_config_values = mq_emit_config_values; | ||
| 1071 | mq->policy.set_config_value = mq_set_config_value; | ||
| 1072 | } | ||
| 1073 | |||
| 1074 | static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, | ||
| 1075 | sector_t origin_size, | ||
| 1076 | sector_t cache_block_size) | ||
| 1077 | { | ||
| 1078 | int r; | ||
| 1079 | struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL); | ||
| 1080 | |||
| 1081 | if (!mq) | ||
| 1082 | return NULL; | ||
| 1083 | |||
| 1084 | init_policy_functions(mq); | ||
| 1085 | iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT); | ||
| 1086 | |||
| 1087 | mq->cache_size = cache_size; | ||
| 1088 | mq->tick_protected = 0; | ||
| 1089 | mq->tick = 0; | ||
| 1090 | mq->hit_count = 0; | ||
| 1091 | mq->generation = 0; | ||
| 1092 | mq->promote_threshold = 0; | ||
| 1093 | mutex_init(&mq->lock); | ||
| 1094 | spin_lock_init(&mq->tick_lock); | ||
| 1095 | mq->find_free_nr_words = dm_div_up(from_cblock(mq->cache_size), BITS_PER_LONG); | ||
| 1096 | mq->find_free_last_word = 0; | ||
| 1097 | |||
| 1098 | queue_init(&mq->pre_cache); | ||
| 1099 | queue_init(&mq->cache); | ||
| 1100 | mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U); | ||
| 1101 | |||
| 1102 | mq->nr_entries = 2 * from_cblock(cache_size); | ||
| 1103 | r = alloc_entries(mq, mq->nr_entries); | ||
| 1104 | if (r) | ||
| 1105 | goto bad_cache_alloc; | ||
| 1106 | |||
| 1107 | mq->nr_entries_allocated = 0; | ||
| 1108 | mq->nr_cblocks_allocated = 0; | ||
| 1109 | |||
| 1110 | mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); | ||
| 1111 | mq->hash_bits = ffs(mq->nr_buckets) - 1; | ||
| 1112 | mq->table = kzalloc(sizeof(*mq->table) * mq->nr_buckets, GFP_KERNEL); | ||
| 1113 | if (!mq->table) | ||
| 1114 | goto bad_alloc_table; | ||
| 1115 | |||
| 1116 | mq->allocation_bitset = alloc_bitset(from_cblock(cache_size)); | ||
| 1117 | if (!mq->allocation_bitset) | ||
| 1118 | goto bad_alloc_bitset; | ||
| 1119 | |||
| 1120 | return &mq->policy; | ||
| 1121 | |||
| 1122 | bad_alloc_bitset: | ||
| 1123 | kfree(mq->table); | ||
| 1124 | bad_alloc_table: | ||
| 1125 | free_entries(mq); | ||
| 1126 | bad_cache_alloc: | ||
| 1127 | kfree(mq); | ||
| 1128 | |||
| 1129 | return NULL; | ||
| 1130 | } | ||
| 1131 | |||
| 1132 | /*----------------------------------------------------------------*/ | ||
| 1133 | |||
| 1134 | static struct dm_cache_policy_type mq_policy_type = { | ||
| 1135 | .name = "mq", | ||
| 1136 | .hint_size = 4, | ||
| 1137 | .owner = THIS_MODULE, | ||
| 1138 | .create = mq_create | ||
| 1139 | }; | ||
| 1140 | |||
| 1141 | static struct dm_cache_policy_type default_policy_type = { | ||
| 1142 | .name = "default", | ||
| 1143 | .hint_size = 4, | ||
| 1144 | .owner = THIS_MODULE, | ||
| 1145 | .create = mq_create | ||
| 1146 | }; | ||
| 1147 | |||
| 1148 | static int __init mq_init(void) | ||
| 1149 | { | ||
| 1150 | int r; | ||
| 1151 | |||
| 1152 | mq_entry_cache = kmem_cache_create("dm_mq_policy_cache_entry", | ||
| 1153 | sizeof(struct entry), | ||
| 1154 | __alignof__(struct entry), | ||
| 1155 | 0, NULL); | ||
| 1156 | if (!mq_entry_cache) | ||
| 1157 | goto bad; | ||
| 1158 | |||
| 1159 | r = dm_cache_policy_register(&mq_policy_type); | ||
| 1160 | if (r) { | ||
| 1161 | DMERR("register failed %d", r); | ||
| 1162 | goto bad_register_mq; | ||
| 1163 | } | ||
| 1164 | |||
| 1165 | r = dm_cache_policy_register(&default_policy_type); | ||
| 1166 | if (!r) { | ||
| 1167 | DMINFO("version " MQ_VERSION " loaded"); | ||
| 1168 | return 0; | ||
| 1169 | } | ||
| 1170 | |||
| 1171 | DMERR("register failed (as default) %d", r); | ||
| 1172 | |||
| 1173 | dm_cache_policy_unregister(&mq_policy_type); | ||
| 1174 | bad_register_mq: | ||
| 1175 | kmem_cache_destroy(mq_entry_cache); | ||
| 1176 | bad: | ||
| 1177 | return -ENOMEM; | ||
| 1178 | } | ||
| 1179 | |||
| 1180 | static void __exit mq_exit(void) | ||
| 1181 | { | ||
| 1182 | dm_cache_policy_unregister(&mq_policy_type); | ||
| 1183 | dm_cache_policy_unregister(&default_policy_type); | ||
| 1184 | |||
| 1185 | kmem_cache_destroy(mq_entry_cache); | ||
| 1186 | } | ||
| 1187 | |||
| 1188 | module_init(mq_init); | ||
| 1189 | module_exit(mq_exit); | ||
| 1190 | |||
| 1191 | MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); | ||
| 1192 | MODULE_LICENSE("GPL"); | ||
| 1193 | MODULE_DESCRIPTION("mq cache policy"); | ||
| 1194 | |||
| 1195 | MODULE_ALIAS("dm-cache-default"); | ||
diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c new file mode 100644 index 000000000000..2cbf5fdaac52 --- /dev/null +++ b/drivers/md/dm-cache-policy.c | |||
| @@ -0,0 +1,161 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2012 Red Hat. All rights reserved. | ||
| 3 | * | ||
| 4 | * This file is released under the GPL. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include "dm-cache-policy-internal.h" | ||
| 8 | #include "dm.h" | ||
| 9 | |||
| 10 | #include <linux/module.h> | ||
| 11 | #include <linux/slab.h> | ||
| 12 | |||
| 13 | /*----------------------------------------------------------------*/ | ||
| 14 | |||
| 15 | #define DM_MSG_PREFIX "cache-policy" | ||
| 16 | |||
| 17 | static DEFINE_SPINLOCK(register_lock); | ||
| 18 | static LIST_HEAD(register_list); | ||
| 19 | |||
| 20 | static struct dm_cache_policy_type *__find_policy(const char *name) | ||
| 21 | { | ||
| 22 | struct dm_cache_policy_type *t; | ||
| 23 | |||
| 24 | list_for_each_entry(t, ®ister_list, list) | ||
| 25 | if (!strcmp(t->name, name)) | ||
| 26 | return t; | ||
| 27 | |||
| 28 | return NULL; | ||
| 29 | } | ||
| 30 | |||
| 31 | static struct dm_cache_policy_type *__get_policy_once(const char *name) | ||
| 32 | { | ||
| 33 | struct dm_cache_policy_type *t = __find_policy(name); | ||
| 34 | |||
| 35 | if (t && !try_module_get(t->owner)) { | ||
| 36 | DMWARN("couldn't get module %s", name); | ||
| 37 | t = ERR_PTR(-EINVAL); | ||
| 38 | } | ||
| 39 | |||
| 40 | return t; | ||
| 41 | } | ||
| 42 | |||
| 43 | static struct dm_cache_policy_type *get_policy_once(const char *name) | ||
| 44 | { | ||
| 45 | struct dm_cache_policy_type *t; | ||
| 46 | |||
| 47 | spin_lock(®ister_lock); | ||
| 48 | t = __get_policy_once(name); | ||
| 49 | spin_unlock(®ister_lock); | ||
| 50 | |||
| 51 | return t; | ||
| 52 | } | ||
| 53 | |||
| 54 | static struct dm_cache_policy_type *get_policy(const char *name) | ||
| 55 | { | ||
| 56 | struct dm_cache_policy_type *t; | ||
| 57 | |||
| 58 | t = get_policy_once(name); | ||
| 59 | if (IS_ERR(t)) | ||
| 60 | return NULL; | ||
| 61 | |||
| 62 | if (t) | ||
| 63 | return t; | ||
| 64 | |||
| 65 | request_module("dm-cache-%s", name); | ||
| 66 | |||
| 67 | t = get_policy_once(name); | ||
| 68 | if (IS_ERR(t)) | ||
| 69 | return NULL; | ||
| 70 | |||
| 71 | return t; | ||
| 72 | } | ||
| 73 | |||
| 74 | static void put_policy(struct dm_cache_policy_type *t) | ||
| 75 | { | ||
| 76 | module_put(t->owner); | ||
| 77 | } | ||
| 78 | |||
| 79 | int dm_cache_policy_register(struct dm_cache_policy_type *type) | ||
| 80 | { | ||
| 81 | int r; | ||
| 82 | |||
| 83 | /* One size fits all for now */ | ||
| 84 | if (type->hint_size != 0 && type->hint_size != 4) { | ||
| 85 | DMWARN("hint size must be 0 or 4 but %llu supplied.", (unsigned long long) type->hint_size); | ||
| 86 | return -EINVAL; | ||
| 87 | } | ||
| 88 | |||
| 89 | spin_lock(®ister_lock); | ||
| 90 | if (__find_policy(type->name)) { | ||
| 91 | DMWARN("attempt to register policy under duplicate name %s", type->name); | ||
| 92 | r = -EINVAL; | ||
| 93 | } else { | ||
| 94 | list_add(&type->list, ®ister_list); | ||
| 95 | r = 0; | ||
| 96 | } | ||
| 97 | spin_unlock(®ister_lock); | ||
| 98 | |||
| 99 | return r; | ||
| 100 | } | ||
| 101 | EXPORT_SYMBOL_GPL(dm_cache_policy_register); | ||
| 102 | |||
| 103 | void dm_cache_policy_unregister(struct dm_cache_policy_type *type) | ||
| 104 | { | ||
| 105 | spin_lock(®ister_lock); | ||
| 106 | list_del_init(&type->list); | ||
| 107 | spin_unlock(®ister_lock); | ||
| 108 | } | ||
| 109 | EXPORT_SYMBOL_GPL(dm_cache_policy_unregister); | ||
| 110 | |||
| 111 | struct dm_cache_policy *dm_cache_policy_create(const char *name, | ||
| 112 | dm_cblock_t cache_size, | ||
| 113 | sector_t origin_size, | ||
| 114 | sector_t cache_block_size) | ||
| 115 | { | ||
| 116 | struct dm_cache_policy *p = NULL; | ||
| 117 | struct dm_cache_policy_type *type; | ||
| 118 | |||
| 119 | type = get_policy(name); | ||
| 120 | if (!type) { | ||
| 121 | DMWARN("unknown policy type"); | ||
| 122 | return NULL; | ||
| 123 | } | ||
| 124 | |||
| 125 | p = type->create(cache_size, origin_size, cache_block_size); | ||
| 126 | if (!p) { | ||
| 127 | put_policy(type); | ||
| 128 | return NULL; | ||
| 129 | } | ||
| 130 | p->private = type; | ||
| 131 | |||
| 132 | return p; | ||
| 133 | } | ||
| 134 | EXPORT_SYMBOL_GPL(dm_cache_policy_create); | ||
| 135 | |||
| 136 | void dm_cache_policy_destroy(struct dm_cache_policy *p) | ||
| 137 | { | ||
| 138 | struct dm_cache_policy_type *t = p->private; | ||
| 139 | |||
| 140 | p->destroy(p); | ||
| 141 | put_policy(t); | ||
| 142 | } | ||
| 143 | EXPORT_SYMBOL_GPL(dm_cache_policy_destroy); | ||
| 144 | |||
| 145 | const char *dm_cache_policy_get_name(struct dm_cache_policy *p) | ||
| 146 | { | ||
| 147 | struct dm_cache_policy_type *t = p->private; | ||
| 148 | |||
| 149 | return t->name; | ||
| 150 | } | ||
| 151 | EXPORT_SYMBOL_GPL(dm_cache_policy_get_name); | ||
| 152 | |||
| 153 | size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p) | ||
| 154 | { | ||
| 155 | struct dm_cache_policy_type *t = p->private; | ||
| 156 | |||
| 157 | return t->hint_size; | ||
| 158 | } | ||
| 159 | EXPORT_SYMBOL_GPL(dm_cache_policy_get_hint_size); | ||
| 160 | |||
| 161 | /*----------------------------------------------------------------*/ | ||
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h new file mode 100644 index 000000000000..f0f51b260544 --- /dev/null +++ b/drivers/md/dm-cache-policy.h | |||
| @@ -0,0 +1,228 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2012 Red Hat. All rights reserved. | ||
| 3 | * | ||
| 4 | * This file is released under the GPL. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #ifndef DM_CACHE_POLICY_H | ||
| 8 | #define DM_CACHE_POLICY_H | ||
| 9 | |||
| 10 | #include "dm-cache-block-types.h" | ||
| 11 | |||
| 12 | #include <linux/device-mapper.h> | ||
| 13 | |||
| 14 | /*----------------------------------------------------------------*/ | ||
| 15 | |||
| 16 | /* FIXME: make it clear which methods are optional. Get debug policy to | ||
| 17 | * double check this at start. | ||
| 18 | */ | ||
| 19 | |||
| 20 | /* | ||
| 21 | * The cache policy makes the important decisions about which blocks get to | ||
| 22 | * live on the faster cache device. | ||
| 23 | * | ||
| 24 | * When the core target has to remap a bio it calls the 'map' method of the | ||
| 25 | * policy. This returns an instruction telling the core target what to do. | ||
| 26 | * | ||
| 27 | * POLICY_HIT: | ||
| 28 | * That block is in the cache. Remap to the cache and carry on. | ||
| 29 | * | ||
| 30 | * POLICY_MISS: | ||
| 31 | * This block is on the origin device. Remap and carry on. | ||
| 32 | * | ||
| 33 | * POLICY_NEW: | ||
| 34 | * This block is currently on the origin device, but the policy wants to | ||
| 35 | * move it. The core should: | ||
| 36 | * | ||
| 37 | * - hold any further io to this origin block | ||
| 38 | * - copy the origin to the given cache block | ||
| 39 | * - release all the held blocks | ||
| 40 | * - remap the original block to the cache | ||
| 41 | * | ||
| 42 | * POLICY_REPLACE: | ||
| 43 | * This block is currently on the origin device. The policy wants to | ||
| 44 | * move it to the cache, with the added complication that the destination | ||
| 45 | * cache block needs a writeback first. The core should: | ||
| 46 | * | ||
| 47 | * - hold any further io to this origin block | ||
| 48 | * - hold any further io to the origin block that's being written back | ||
| 49 | * - writeback | ||
| 50 | * - copy new block to cache | ||
| 51 | * - release held blocks | ||
| 52 | * - remap bio to cache and reissue. | ||
| 53 | * | ||
| 54 | * Should the core run into trouble while processing a POLICY_NEW or | ||
| 55 | * POLICY_REPLACE instruction, it will roll back the policy's mapping using | ||
| 56 | * remove_mapping() or force_mapping(). These methods must not fail. This | ||
| 57 | * approach avoids having transactional semantics in the policy (ie, the | ||
| 58 | * core informing the policy when a migration is complete), and hence makes | ||
| 59 | * it easier to write new policies. | ||
| 60 | * | ||
| 61 | * In general policy methods should never block, except in the case of the | ||
| 62 | * map function when can_block is set. So be careful to implement using | ||
| 63 | * bounded, preallocated memory. | ||
| 64 | */ | ||
| 65 | enum policy_operation { | ||
| 66 | POLICY_HIT, | ||
| 67 | POLICY_MISS, | ||
| 68 | POLICY_NEW, | ||
| 69 | POLICY_REPLACE | ||
| 70 | }; | ||
| 71 | |||
| 72 | /* | ||
| 73 | * This is the instruction passed back to the core target. | ||
| 74 | */ | ||
| 75 | struct policy_result { | ||
| 76 | enum policy_operation op; | ||
| 77 | dm_oblock_t old_oblock; /* POLICY_REPLACE */ | ||
| 78 | dm_cblock_t cblock; /* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */ | ||
| 79 | }; | ||
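
To make the contract concrete, here is a hedged sketch (not part of the patch) of how a caller might dispatch on the instruction; the actions in the comments paraphrase the description above, and the function name is invented:

static void act_on_result(struct policy_result *result)
{
	switch (result->op) {
	case POLICY_HIT:
		/* remap the bio to result->cblock on the cache device */
		break;
	case POLICY_MISS:
		/* remap the bio to the origin device */
		break;
	case POLICY_NEW:
		/* hold io, copy the origin block to result->cblock, remap */
		break;
	case POLICY_REPLACE:
		/* write back result->old_oblock's cache block first, then
		 * proceed as for POLICY_NEW */
		break;
	}
}
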
| 80 | |||
| 81 | typedef int (*policy_walk_fn)(void *context, dm_cblock_t cblock, | ||
| 82 | dm_oblock_t oblock, uint32_t hint); | ||
| 83 | |||
| 84 | /* | ||
| 85 | * The cache policy object. Just a bunch of methods. It is envisaged that | ||
| 86 | * this structure will be embedded in a bigger, policy-specific structure | ||
| 87 | * (ie. use container_of()). | ||
| 88 | */ | ||
| 89 | struct dm_cache_policy { | ||
| 90 | |||
| 91 | /* | ||
| 92 | * FIXME: make it clear which methods are optional, and which may | ||
| 93 | * block. | ||
| 94 | */ | ||
| 95 | |||
| 96 | /* | ||
| 97 | * Destroys this object. | ||
| 98 | */ | ||
| 99 | void (*destroy)(struct dm_cache_policy *p); | ||
| 100 | |||
| 101 | /* | ||
| 102 | * See large comment above. | ||
| 103 | * | ||
| 104 | * oblock - the origin block we're interested in. | ||
| 105 | * | ||
| 106 | * can_block - indicates whether the current thread is allowed to | ||
| 107 | * block. -EWOULDBLOCK returned if it can't and would. | ||
| 108 | * | ||
| 109 | * can_migrate - gives permission for POLICY_NEW or POLICY_REPLACE | ||
| 110 | * instructions. If denied and the policy would have | ||
| 111 | * returned one of these instructions it should | ||
| 112 | * return -EWOULDBLOCK. | ||
| 113 | * | ||
| 114 | * discarded_oblock - indicates whether the whole origin block is | ||
| 115 | * in a discarded state (FIXME: better to tell the | ||
| 116 | * policy about this sooner, so it can recycle that | ||
| 117 | * cache block if it wants.) | ||
| 118 | * bio - the bio that triggered this call. | ||
| 119 | * result - gets filled in with the instruction. | ||
| 120 | * | ||
| 121 | * May only return 0, or -EWOULDBLOCK (if !can_migrate) | ||
| 122 | */ | ||
| 123 | int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock, | ||
| 124 | bool can_block, bool can_migrate, bool discarded_oblock, | ||
| 125 | struct bio *bio, struct policy_result *result); | ||
| 126 | |||
| 127 | /* | ||
| 128 | * Sometimes we want to see if a block is in the cache, without | ||
| 129 | * triggering any update of stats. (ie. it's not a real hit). | ||
| 130 | * | ||
| 131 | * Must not block. | ||
| 132 | * | ||
| 133 | * Returns 0 if in cache (*cblock filled in), -ENOENT if not, or < 0 on | ||
| 134 | * some other error (-EWOULDBLOCK would be typical). | ||
| 135 | */ | ||
| 136 | int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock); | ||
| 137 | |||
| 138 | /* | ||
| 139 | * oblock must be a mapped block. Must not block. | ||
| 140 | */ | ||
| 141 | void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); | ||
| 142 | void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); | ||
| 143 | |||
| 144 | /* | ||
| 145 | * Called when a cache target is first created. Used to load a | ||
| 146 | * mapping from the metadata device into the policy. | ||
| 147 | */ | ||
| 148 | int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock, | ||
| 149 | dm_cblock_t cblock, uint32_t hint, bool hint_valid); | ||
| 150 | |||
| 151 | int (*walk_mappings)(struct dm_cache_policy *p, policy_walk_fn fn, | ||
| 152 | void *context); | ||
| 153 | |||
| 154 | /* | ||
| 155 | * Override functions used on the error paths of the core target. | ||
| 156 | * They must succeed. | ||
| 157 | */ | ||
| 158 | void (*remove_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock); | ||
| 159 | void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock, | ||
| 160 | dm_oblock_t new_oblock); | ||
| 161 | |||
| 162 | int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock); | ||
| 163 | |||
| 164 | |||
| 165 | /* | ||
| 166 | * How full is the cache? | ||
| 167 | */ | ||
| 168 | dm_cblock_t (*residency)(struct dm_cache_policy *p); | ||
| 169 | |||
| 170 | /* | ||
| 171 | * Because of where we sit in the block layer, we can be asked to | ||
| 172 | * map a lot of little bios that are all in the same block (no | ||
| 173 | * queue merging has occurred). To stop the policy being fooled by | ||
| 174 | * these the core target sends regular tick() calls to the policy. | ||
| 175 | * The policy should only count an entry as hit once per tick. | ||
| 176 | */ | ||
| 177 | void (*tick)(struct dm_cache_policy *p); | ||
| 178 | |||
| 179 | /* | ||
| 180 | * Configuration. | ||
| 181 | */ | ||
| 182 | int (*emit_config_values)(struct dm_cache_policy *p, | ||
| 183 | char *result, unsigned maxlen); | ||
| 184 | int (*set_config_value)(struct dm_cache_policy *p, | ||
| 185 | const char *key, const char *value); | ||
| 186 | |||
| 187 | /* | ||
| 188 | * Book keeping ptr for the policy register, not for general use. | ||
| 189 | */ | ||
| 190 | void *private; | ||
| 191 | }; | ||
| 192 | |||
| 193 | /*----------------------------------------------------------------*/ | ||
| 194 | |||
| 195 | /* | ||
| 196 | * We maintain a little register of the different policy types. | ||
| 197 | */ | ||
| 198 | #define CACHE_POLICY_NAME_SIZE 16 | ||
| 199 | |||
| 200 | struct dm_cache_policy_type { | ||
| 201 | /* For use by the register code only. */ | ||
| 202 | struct list_head list; | ||
| 203 | |||
| 204 | /* | ||
| 205 | * Policy writers should fill in these fields. The name field is | ||
| 206 | * what gets passed on the target line to select your policy. | ||
| 207 | */ | ||
| 208 | char name[CACHE_POLICY_NAME_SIZE]; | ||
| 209 | |||
| 210 | /* | ||
| 211 | * Policies may store a hint for each cache block. | ||
| 212 | * Currently the size of this hint must be 0 or 4 bytes but we | ||
| 213 | * expect to relax this in future. | ||
| 214 | */ | ||
| 215 | size_t hint_size; | ||
| 216 | |||
| 217 | struct module *owner; | ||
| 218 | struct dm_cache_policy *(*create)(dm_cblock_t cache_size, | ||
| 219 | sector_t origin_size, | ||
| 220 | sector_t block_size); | ||
| 221 | }; | ||
| 222 | |||
| 223 | int dm_cache_policy_register(struct dm_cache_policy_type *type); | ||
| 224 | void dm_cache_policy_unregister(struct dm_cache_policy_type *type); | ||
| 225 | |||
| 226 | /*----------------------------------------------------------------*/ | ||
| 227 | |||
| 228 | #endif /* DM_CACHE_POLICY_H */ | ||
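
For context, the interface above is enough to write an out-of-tree policy. The following is a hedged skeleton, not part of the patch; the "noop" policy and its create function are hypothetical placeholders, but the registration calls and the 0-or-4 hint size constraint come straight from dm-cache-policy.c above:

#include "dm-cache-policy.h"
#include <linux/module.h>

static struct dm_cache_policy *noop_create(dm_cblock_t cache_size,
					   sector_t origin_size,
					   sector_t cache_block_size)
{
	/* Allocate a structure embedding struct dm_cache_policy and fill in
	 * the method pointers, as mq_create() does above. */
	return NULL;	/* placeholder */
}

static struct dm_cache_policy_type noop_policy_type = {
	.name = "noop",
	.hint_size = 0,		/* 0 or 4 are the only sizes accepted */
	.owner = THIS_MODULE,
	.create = noop_create
};

static int __init noop_init(void)
{
	return dm_cache_policy_register(&noop_policy_type);
}

static void __exit noop_exit(void)
{
	dm_cache_policy_unregister(&noop_policy_type);
}

module_init(noop_init);
module_exit(noop_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS("dm-cache-noop");	/* lets get_policy()'s request_module() find it */
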
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c new file mode 100644 index 000000000000..0f4e84b15c30 --- /dev/null +++ b/drivers/md/dm-cache-target.c | |||
| @@ -0,0 +1,2584 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2012 Red Hat. All rights reserved. | ||
| 3 | * | ||
| 4 | * This file is released under the GPL. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include "dm.h" | ||
| 8 | #include "dm-bio-prison.h" | ||
| 9 | #include "dm-cache-metadata.h" | ||
| 10 | |||
| 11 | #include <linux/dm-io.h> | ||
| 12 | #include <linux/dm-kcopyd.h> | ||
| 13 | #include <linux/init.h> | ||
| 14 | #include <linux/mempool.h> | ||
| 15 | #include <linux/module.h> | ||
| 16 | #include <linux/slab.h> | ||
| 17 | #include <linux/vmalloc.h> | ||
| 18 | |||
| 19 | #define DM_MSG_PREFIX "cache" | ||
| 20 | |||
| 21 | DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, | ||
| 22 | "A percentage of time allocated for copying to and/or from cache"); | ||
| 23 | |||
| 24 | /*----------------------------------------------------------------*/ | ||
| 25 | |||
| 26 | /* | ||
| 27 | * Glossary: | ||
| 28 | * | ||
| 29 | * oblock: index of an origin block | ||
| 30 | * cblock: index of a cache block | ||
| 31 | * promotion: movement of a block from origin to cache | ||
| 32 | * demotion: movement of a block from cache to origin | ||
| 33 | * migration: movement of a block between the origin and cache device, | ||
| 34 | * either direction | ||
| 35 | */ | ||
| 36 | |||
| 37 | /*----------------------------------------------------------------*/ | ||
| 38 | |||
| 39 | static size_t bitset_size_in_bytes(unsigned nr_entries) | ||
| 40 | { | ||
| 41 | return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG); | ||
| 42 | } | ||
| 43 | |||
| 44 | static unsigned long *alloc_bitset(unsigned nr_entries) | ||
| 45 | { | ||
| 46 | size_t s = bitset_size_in_bytes(nr_entries); | ||
| 47 | return vzalloc(s); | ||
| 48 | } | ||
| 49 | |||
| 50 | static void clear_bitset(void *bitset, unsigned nr_entries) | ||
| 51 | { | ||
| 52 | size_t s = bitset_size_in_bytes(nr_entries); | ||
| 53 | memset(bitset, 0, s); | ||
| 54 | } | ||
| 55 | |||
| 56 | static void free_bitset(unsigned long *bits) | ||
| 57 | { | ||
| 58 | vfree(bits); | ||
| 59 | } | ||
| 60 | |||
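
A quick, hypothetical userspace check of the sizing rule above: on a 64-bit build, 10000 entries round up to 157 longs, i.e. 1256 bytes.

#include <stdio.h>

int main(void)
{
	unsigned nr_entries = 10000;
	size_t bits_per_long = sizeof(unsigned long) * 8;
	size_t words = (nr_entries + bits_per_long - 1) / bits_per_long;	/* dm_div_up */

	printf("%zu longs, %zu bytes\n", words, words * sizeof(unsigned long));
	return 0;
}
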
| 61 | /*----------------------------------------------------------------*/ | ||
| 62 | |||
| 63 | #define PRISON_CELLS 1024 | ||
| 64 | #define MIGRATION_POOL_SIZE 128 | ||
| 65 | #define COMMIT_PERIOD HZ | ||
| 66 | #define MIGRATION_COUNT_WINDOW 10 | ||
| 67 | |||
| 68 | /* | ||
| 69 | * The block size of the device holding cache data must be >= 32KB | ||
| 70 | */ | ||
| 71 | #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT) | ||
| 72 | |||
| 73 | /* | ||
| 74 | * FIXME: the cache is read/write for the time being. | ||
| 75 | */ | ||
| 76 | enum cache_mode { | ||
| 77 | CM_WRITE, /* metadata may be changed */ | ||
| 78 | CM_READ_ONLY, /* metadata may not be changed */ | ||
| 79 | }; | ||
| 80 | |||
| 81 | struct cache_features { | ||
| 82 | enum cache_mode mode; | ||
| 83 | bool write_through:1; | ||
| 84 | }; | ||
| 85 | |||
| 86 | struct cache_stats { | ||
| 87 | atomic_t read_hit; | ||
| 88 | atomic_t read_miss; | ||
| 89 | atomic_t write_hit; | ||
| 90 | atomic_t write_miss; | ||
| 91 | atomic_t demotion; | ||
| 92 | atomic_t promotion; | ||
| 93 | atomic_t copies_avoided; | ||
| 94 | atomic_t cache_cell_clash; | ||
| 95 | atomic_t commit_count; | ||
| 96 | atomic_t discard_count; | ||
| 97 | }; | ||
| 98 | |||
| 99 | struct cache { | ||
| 100 | struct dm_target *ti; | ||
| 101 | struct dm_target_callbacks callbacks; | ||
| 102 | |||
| 103 | /* | ||
| 104 | * Metadata is written to this device. | ||
| 105 | */ | ||
| 106 | struct dm_dev *metadata_dev; | ||
| 107 | |||
| 108 | /* | ||
| 109 | * The slower of the two data devices. Typically a spindle. | ||
| 110 | */ | ||
| 111 | struct dm_dev *origin_dev; | ||
| 112 | |||
| 113 | /* | ||
| 114 | * The faster of the two data devices. Typically an SSD. | ||
| 115 | */ | ||
| 116 | struct dm_dev *cache_dev; | ||
| 117 | |||
| 118 | /* | ||
| 119 | * Cache features such as write-through. | ||
| 120 | */ | ||
| 121 | struct cache_features features; | ||
| 122 | |||
| 123 | /* | ||
| 124 | * Size of the origin device in _complete_ blocks and native sectors. | ||
| 125 | */ | ||
| 126 | dm_oblock_t origin_blocks; | ||
| 127 | sector_t origin_sectors; | ||
| 128 | |||
| 129 | /* | ||
| 130 | * Size of the cache device in blocks. | ||
| 131 | */ | ||
| 132 | dm_cblock_t cache_size; | ||
| 133 | |||
| 134 | /* | ||
| 135 | * Fields for converting from sectors to blocks. | ||
| 136 | */ | ||
| 137 | uint32_t sectors_per_block; | ||
| 138 | int sectors_per_block_shift; | ||
| 139 | |||
| 140 | struct dm_cache_metadata *cmd; | ||
| 141 | |||
| 142 | spinlock_t lock; | ||
| 143 | struct bio_list deferred_bios; | ||
| 144 | struct bio_list deferred_flush_bios; | ||
| 145 | struct list_head quiesced_migrations; | ||
| 146 | struct list_head completed_migrations; | ||
| 147 | struct list_head need_commit_migrations; | ||
| 148 | sector_t migration_threshold; | ||
| 149 | atomic_t nr_migrations; | ||
| 150 | wait_queue_head_t migration_wait; | ||
| 151 | |||
| 152 | /* | ||
| 153 | * cache_size entries, dirty if set | ||
| 154 | */ | ||
| 155 | dm_cblock_t nr_dirty; | ||
| 156 | unsigned long *dirty_bitset; | ||
| 157 | |||
| 158 | /* | ||
| 159 | * origin_blocks entries, discarded if set. | ||
| 160 | */ | ||
| 161 | sector_t discard_block_size; /* a power of 2 times sectors per block */ | ||
| 162 | dm_dblock_t discard_nr_blocks; | ||
| 163 | unsigned long *discard_bitset; | ||
| 164 | |||
| 165 | struct dm_kcopyd_client *copier; | ||
| 166 | struct workqueue_struct *wq; | ||
| 167 | struct work_struct worker; | ||
| 168 | |||
| 169 | struct delayed_work waker; | ||
| 170 | unsigned long last_commit_jiffies; | ||
| 171 | |||
| 172 | struct dm_bio_prison *prison; | ||
| 173 | struct dm_deferred_set *all_io_ds; | ||
| 174 | |||
| 175 | mempool_t *migration_pool; | ||
| 176 | struct dm_cache_migration *next_migration; | ||
| 177 | |||
| 178 | struct dm_cache_policy *policy; | ||
| 179 | unsigned policy_nr_args; | ||
| 180 | |||
| 181 | bool need_tick_bio:1; | ||
| 182 | bool sized:1; | ||
| 183 | bool quiescing:1; | ||
| 184 | bool commit_requested:1; | ||
| 185 | bool loaded_mappings:1; | ||
| 186 | bool loaded_discards:1; | ||
| 187 | |||
| 188 | struct cache_stats stats; | ||
| 189 | |||
| 190 | /* | ||
| 191 | * Rather than reconstructing the table line for the status, we just | ||
| 192 | * save it and regurgitate it. | ||
| 193 | */ | ||
| 194 | unsigned nr_ctr_args; | ||
| 195 | const char **ctr_args; | ||
| 196 | }; | ||
| 197 | |||
| 198 | struct per_bio_data { | ||
| 199 | bool tick:1; | ||
| 200 | unsigned req_nr:2; | ||
| 201 | struct dm_deferred_entry *all_io_entry; | ||
| 202 | }; | ||
| 203 | |||
| 204 | struct dm_cache_migration { | ||
| 205 | struct list_head list; | ||
| 206 | struct cache *cache; | ||
| 207 | |||
| 208 | unsigned long start_jiffies; | ||
| 209 | dm_oblock_t old_oblock; | ||
| 210 | dm_oblock_t new_oblock; | ||
| 211 | dm_cblock_t cblock; | ||
| 212 | |||
| 213 | bool err:1; | ||
| 214 | bool writeback:1; | ||
| 215 | bool demote:1; | ||
| 216 | bool promote:1; | ||
| 217 | |||
| 218 | struct dm_bio_prison_cell *old_ocell; | ||
| 219 | struct dm_bio_prison_cell *new_ocell; | ||
| 220 | }; | ||
| 221 | |||
| 222 | /* | ||
| 223 | * Processing a bio in the worker thread may require these memory | ||
| 224 | * allocations. We prealloc to avoid deadlocks (the same worker thread | ||
| 225 | * frees them back to the mempool). | ||
| 226 | */ | ||
| 227 | struct prealloc { | ||
| 228 | struct dm_cache_migration *mg; | ||
| 229 | struct dm_bio_prison_cell *cell1; | ||
| 230 | struct dm_bio_prison_cell *cell2; | ||
| 231 | }; | ||
| 232 | |||
| 233 | static void wake_worker(struct cache *cache) | ||
| 234 | { | ||
| 235 | queue_work(cache->wq, &cache->worker); | ||
| 236 | } | ||
| 237 | |||
| 238 | /*----------------------------------------------------------------*/ | ||
| 239 | |||
| 240 | static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache) | ||
| 241 | { | ||
| 242 | /* FIXME: change to use a local slab. */ | ||
| 243 | return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT); | ||
| 244 | } | ||
| 245 | |||
| 246 | static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell) | ||
| 247 | { | ||
| 248 | dm_bio_prison_free_cell(cache->prison, cell); | ||
| 249 | } | ||
| 250 | |||
| 251 | static int prealloc_data_structs(struct cache *cache, struct prealloc *p) | ||
| 252 | { | ||
| 253 | if (!p->mg) { | ||
| 254 | p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT); | ||
| 255 | if (!p->mg) | ||
| 256 | return -ENOMEM; | ||
| 257 | } | ||
| 258 | |||
| 259 | if (!p->cell1) { | ||
| 260 | p->cell1 = alloc_prison_cell(cache); | ||
| 261 | if (!p->cell1) | ||
| 262 | return -ENOMEM; | ||
| 263 | } | ||
| 264 | |||
| 265 | if (!p->cell2) { | ||
| 266 | p->cell2 = alloc_prison_cell(cache); | ||
| 267 | if (!p->cell2) | ||
| 268 | return -ENOMEM; | ||
| 269 | } | ||
| 270 | |||
| 271 | return 0; | ||
| 272 | } | ||
| 273 | |||
| 274 | static void prealloc_free_structs(struct cache *cache, struct prealloc *p) | ||
| 275 | { | ||
| 276 | if (p->cell2) | ||
| 277 | free_prison_cell(cache, p->cell2); | ||
| 278 | |||
| 279 | if (p->cell1) | ||
| 280 | free_prison_cell(cache, p->cell1); | ||
| 281 | |||
| 282 | if (p->mg) | ||
| 283 | mempool_free(p->mg, cache->migration_pool); | ||
| 284 | } | ||
| 285 | |||
| 286 | static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p) | ||
| 287 | { | ||
| 288 | struct dm_cache_migration *mg = p->mg; | ||
| 289 | |||
| 290 | BUG_ON(!mg); | ||
| 291 | p->mg = NULL; | ||
| 292 | |||
| 293 | return mg; | ||
| 294 | } | ||
| 295 | |||
| 296 | /* | ||
| 297 | * You must have a cell within the prealloc struct to return. If not, this | ||
| 298 | * function will BUG() rather than returning NULL. | ||
| 299 | */ | ||
| 300 | static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p) | ||
| 301 | { | ||
| 302 | struct dm_bio_prison_cell *r = NULL; | ||
| 303 | |||
| 304 | if (p->cell1) { | ||
| 305 | r = p->cell1; | ||
| 306 | p->cell1 = NULL; | ||
| 307 | |||
| 308 | } else if (p->cell2) { | ||
| 309 | r = p->cell2; | ||
| 310 | p->cell2 = NULL; | ||
| 311 | } else | ||
| 312 | BUG(); | ||
| 313 | |||
| 314 | return r; | ||
| 315 | } | ||
| 316 | |||
| 317 | /* | ||
| 318 | * You can't have more than two cells in a prealloc struct. BUG() will be | ||
| 319 | * called if you try to overfill. | ||
| 320 | */ | ||
| 321 | static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell) | ||
| 322 | { | ||
| 323 | if (!p->cell2) | ||
| 324 | p->cell2 = cell; | ||
| 325 | |||
| 326 | else if (!p->cell1) | ||
| 327 | p->cell1 = cell; | ||
| 328 | |||
| 329 | else | ||
| 330 | BUG(); | ||
| 331 | } | ||
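
Taken together these helpers support a simple pattern in the worker: top the prealloc struct up once per bio, take what a given code path needs, and hand back anything unused. A hedged illustration follows; the function name is invented and the real consumers appear later in this file:

static void example_use(struct cache *cache, struct prealloc *structs)
{
	struct dm_bio_prison_cell *cell;

	if (prealloc_data_structs(cache, structs))
		return;		/* out of memory: defer the bio and retry later */

	cell = prealloc_get_cell(structs);

	/* ... attempt to detain a bio with the cell; if the cell wasn't
	 * consumed, return it so the struct stays topped up ... */

	prealloc_put_cell(structs, cell);
}
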
| 332 | |||
| 333 | /*----------------------------------------------------------------*/ | ||
| 334 | |||
| 335 | static void build_key(dm_oblock_t oblock, struct dm_cell_key *key) | ||
| 336 | { | ||
| 337 | key->virtual = 0; | ||
| 338 | key->dev = 0; | ||
| 339 | key->block = from_oblock(oblock); | ||
| 340 | } | ||
| 341 | |||
| 342 | /* | ||
| 343 | * The caller hands in a preallocated cell, and a free function for it. | ||
| 344 | * The cell will be freed if there's an error, or if it wasn't used because | ||
| 345 | * a cell with that key already exists. | ||
| 346 | */ | ||
| 347 | typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell); | ||
| 348 | |||
| 349 | static int bio_detain(struct cache *cache, dm_oblock_t oblock, | ||
| 350 | struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, | ||
| 351 | cell_free_fn free_fn, void *free_context, | ||
| 352 | struct dm_bio_prison_cell **cell_result) | ||
| 353 | { | ||
| 354 | int r; | ||
| 355 | struct dm_cell_key key; | ||
| 356 | |||
| 357 | build_key(oblock, &key); | ||
| 358 | r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); | ||
| 359 | if (r) | ||
| 360 | free_fn(free_context, cell_prealloc); | ||
| 361 | |||
| 362 | return r; | ||
| 363 | } | ||
| 364 | |||
| 365 | static int get_cell(struct cache *cache, | ||
| 366 | dm_oblock_t oblock, | ||
| 367 | struct prealloc *structs, | ||
| 368 | struct dm_bio_prison_cell **cell_result) | ||
| 369 | { | ||
| 370 | int r; | ||
| 371 | struct dm_cell_key key; | ||
| 372 | struct dm_bio_prison_cell *cell_prealloc; | ||
| 373 | |||
| 374 | cell_prealloc = prealloc_get_cell(structs); | ||
| 375 | |||
| 376 | build_key(oblock, &key); | ||
| 377 | r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); | ||
| 378 | if (r) | ||
| 379 | prealloc_put_cell(structs, cell_prealloc); | ||
| 380 | |||
| 381 | return r; | ||
| 382 | } | ||
| 383 | |||
| 384 | /*----------------------------------------------------------------*/ | ||
| 385 | |||
| 386 | static bool is_dirty(struct cache *cache, dm_cblock_t b) | ||
| 387 | { | ||
| 388 | return test_bit(from_cblock(b), cache->dirty_bitset); | ||
| 389 | } | ||
| 390 | |||
| 391 | static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) | ||
| 392 | { | ||
| 393 | if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { | ||
| 394 | cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1); | ||
| 395 | policy_set_dirty(cache->policy, oblock); | ||
| 396 | } | ||
| 397 | } | ||
| 398 | |||
| 399 | static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) | ||
| 400 | { | ||
| 401 | if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { | ||
| 402 | policy_clear_dirty(cache->policy, oblock); | ||
| 403 | cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1); | ||
| 404 | if (!from_cblock(cache->nr_dirty)) | ||
| 405 | dm_table_event(cache->ti->table); | ||
| 406 | } | ||
| 407 | } | ||
| 408 | |||
| 409 | /*----------------------------------------------------------------*/ | ||
| 410 | static bool block_size_is_power_of_two(struct cache *cache) | ||
| 411 | { | ||
| 412 | return cache->sectors_per_block_shift >= 0; | ||
| 413 | } | ||
| 414 | |||
| 415 | static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) | ||
| 416 | { | ||
| 417 | sector_t discard_blocks = cache->discard_block_size; | ||
| 418 | dm_block_t b = from_oblock(oblock); | ||
| 419 | |||
| 420 | if (!block_size_is_power_of_two(cache)) | ||
| 421 | (void) sector_div(discard_blocks, cache->sectors_per_block); | ||
| 422 | else | ||
| 423 | discard_blocks >>= cache->sectors_per_block_shift; | ||
| 424 | |||
| 425 | (void) sector_div(b, discard_blocks); | ||
| 426 | |||
| 427 | return to_dblock(b); | ||
| 428 | } | ||
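
A standalone rerun of that conversion with hypothetical geometry (128-sector cache blocks grouped into 1024-sector discard blocks, so 8 origin blocks per discard block):

#include <stdio.h>

int main(void)
{
	unsigned long long discard_block_size = 1024;	/* sectors */
	unsigned sectors_per_block = 128;
	unsigned long long oblock = 21;

	/* 1024 / 128 = 8 origin blocks per discard block */
	unsigned long long discard_blocks = discard_block_size / sectors_per_block;

	printf("oblock %llu -> dblock %llu\n", oblock, oblock / discard_blocks);	/* 21 / 8 = 2 */
	return 0;
}
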
| 429 | |||
| 430 | static void set_discard(struct cache *cache, dm_dblock_t b) | ||
| 431 | { | ||
| 432 | unsigned long flags; | ||
| 433 | |||
| 434 | atomic_inc(&cache->stats.discard_count); | ||
| 435 | |||
| 436 | spin_lock_irqsave(&cache->lock, flags); | ||
| 437 | set_bit(from_dblock(b), cache->discard_bitset); | ||
| 438 | spin_unlock_irqrestore(&cache->lock, flags); | ||
| 439 | } | ||
| 440 | |||
| 441 | static void clear_discard(struct cache *cache, dm_dblock_t b) | ||
| 442 | { | ||
| 443 | unsigned long flags; | ||
| 444 | |||
| 445 | spin_lock_irqsave(&cache->lock, flags); | ||
| 446 | clear_bit(from_dblock(b), cache->discard_bitset); | ||
| 447 | spin_unlock_irqrestore(&cache->lock, flags); | ||
| 448 | } | ||
| 449 | |||
| 450 | static bool is_discarded(struct cache *cache, dm_dblock_t b) | ||
| 451 | { | ||
| 452 | int r; | ||
| 453 | unsigned long flags; | ||
| 454 | |||
| 455 | spin_lock_irqsave(&cache->lock, flags); | ||
| 456 | r = test_bit(from_dblock(b), cache->discard_bitset); | ||
| 457 | spin_unlock_irqrestore(&cache->lock, flags); | ||
| 458 | |||
| 459 | return r; | ||
| 460 | } | ||
| 461 | |||
| 462 | static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) | ||
| 463 | { | ||
| 464 | int r; | ||
| 465 | unsigned long flags; | ||
| 466 | |||
| 467 | spin_lock_irqsave(&cache->lock, flags); | ||
| 468 | r = test_bit(from_dblock(oblock_to_dblock(cache, b)), | ||
| 469 | cache->discard_bitset); | ||
| 470 | spin_unlock_irqrestore(&cache->lock, flags); | ||
| 471 | |||
| 472 | return r; | ||
| 473 | } | ||
| 474 | |||
| 475 | /*----------------------------------------------------------------*/ | ||
| 476 | |||
| 477 | static void load_stats(struct cache *cache) | ||
| 478 | { | ||
| 479 | struct dm_cache_statistics stats; | ||
| 480 | |||
| 481 | dm_cache_metadata_get_stats(cache->cmd, &stats); | ||
| 482 | atomic_set(&cache->stats.read_hit, stats.read_hits); | ||
| 483 | atomic_set(&cache->stats.read_miss, stats.read_misses); | ||
| 484 | atomic_set(&cache->stats.write_hit, stats.write_hits); | ||
| 485 | atomic_set(&cache->stats.write_miss, stats.write_misses); | ||
| 486 | } | ||
| 487 | |||
| 488 | static void save_stats(struct cache *cache) | ||
| 489 | { | ||
| 490 | struct dm_cache_statistics stats; | ||
| 491 | |||
| 492 | stats.read_hits = atomic_read(&cache->stats.read_hit); | ||
| 493 | stats.read_misses = atomic_read(&cache->stats.read_miss); | ||
| 494 | stats.write_hits = atomic_read(&cache->stats.write_hit); | ||
| 495 | stats.write_misses = atomic_read(&cache->stats.write_miss); | ||
| 496 | |||
| 497 | dm_cache_metadata_set_stats(cache->cmd, &stats); | ||
| 498 | } | ||
| 499 | |||
| 500 | /*---------------------------------------------------------------- | ||
| 501 | * Per bio data | ||
| 502 | *--------------------------------------------------------------*/ | ||
| 503 | static struct per_bio_data *get_per_bio_data(struct bio *bio) | ||
| 504 | { | ||
| 505 | struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); | ||
| 506 | BUG_ON(!pb); | ||
| 507 | return pb; | ||
| 508 | } | ||
| 509 | |||
| 510 | static struct per_bio_data *init_per_bio_data(struct bio *bio) | ||
| 511 | { | ||
| 512 | struct per_bio_data *pb = get_per_bio_data(bio); | ||
| 513 | |||
| 514 | pb->tick = false; | ||
| 515 | pb->req_nr = dm_bio_get_target_bio_nr(bio); | ||
| 516 | pb->all_io_entry = NULL; | ||
| 517 | |||
| 518 | return pb; | ||
| 519 | } | ||
| 520 | |||
| 521 | /*---------------------------------------------------------------- | ||
| 522 | * Remapping | ||
| 523 | *--------------------------------------------------------------*/ | ||
| 524 | static void remap_to_origin(struct cache *cache, struct bio *bio) | ||
| 525 | { | ||
| 526 | bio->bi_bdev = cache->origin_dev->bdev; | ||
| 527 | } | ||
| 528 | |||
| 529 | static void remap_to_cache(struct cache *cache, struct bio *bio, | ||
| 530 | dm_cblock_t cblock) | ||
| 531 | { | ||
| 532 | sector_t bi_sector = bio->bi_sector; | ||
| 533 | |||
| 534 | bio->bi_bdev = cache->cache_dev->bdev; | ||
| 535 | if (!block_size_is_power_of_two(cache)) | ||
| 536 | bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) + | ||
| 537 | sector_div(bi_sector, cache->sectors_per_block); | ||
| 538 | else | ||
| 539 | bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) | | ||
| 540 | (bi_sector & (cache->sectors_per_block - 1)); | ||
| 541 | } | ||
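
Both remap paths compute cblock * sectors_per_block plus the bio's offset within the block; the second branch is just the shift-and-mask form available when the block size is a power of two. A self-contained sketch (hypothetical 128-sector block) confirming the two forms agree:

    #include <assert.h>
    #include <stdint.h>

    /* Hypothetical power-of-two block size: 128 sectors, so shift = 7. */
    #define SPB 128u
    #define SPB_SHIFT 7u

    static uint64_t remap_div(uint64_t cblock, uint64_t bi_sector)
    {
            return cblock * SPB + bi_sector % SPB;                  /* generic path */
    }

    static uint64_t remap_shift(uint64_t cblock, uint64_t bi_sector)
    {
            return (cblock << SPB_SHIFT) | (bi_sector & (SPB - 1)); /* fast path */
    }

    int main(void)
    {
            uint64_t s;

            for (s = 0; s < 4096; s++)
                    assert(remap_div(3, s) == remap_shift(3, s));
            return 0;
    }

The OR is safe because the low SPB_SHIFT bits of cblock << SPB_SHIFT are always zero.
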
| 542 | |||
| 543 | static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) | ||
| 544 | { | ||
| 545 | unsigned long flags; | ||
| 546 | struct per_bio_data *pb = get_per_bio_data(bio); | ||
| 547 | |||
| 548 | spin_lock_irqsave(&cache->lock, flags); | ||
| 549 | if (cache->need_tick_bio && | ||
| 550 | !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) { | ||
| 551 | pb->tick = true; | ||
| 552 | cache->need_tick_bio = false; | ||
| 553 | } | ||
| 554 | spin_unlock_irqrestore(&cache->lock, flags); | ||
| 555 | } | ||
| 556 | |||
| 557 | static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, | ||
| 558 | dm_oblock_t oblock) | ||
| 559 | { | ||
| 560 | check_if_tick_bio_needed(cache, bio); | ||
| 561 | remap_to_origin(cache, bio); | ||
| 562 | if (bio_data_dir(bio) == WRITE) | ||
| 563 | clear_discard(cache, oblock_to_dblock(cache, oblock)); | ||
| 564 | } | ||
| 565 | |||
| 566 | static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, | ||
| 567 | dm_oblock_t oblock, dm_cblock_t cblock) | ||
| 568 | { | ||
| 569 | remap_to_cache(cache, bio, cblock); | ||
| 570 | if (bio_data_dir(bio) == WRITE) { | ||
| 571 | set_dirty(cache, oblock, cblock); | ||
| 572 | clear_discard(cache, oblock_to_dblock(cache, oblock)); | ||
| 573 | } | ||
| 574 | } | ||
| 575 | |||
| 576 | static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) | ||
| 577 | { | ||
| 578 | sector_t block_nr = bio->bi_sector; | ||
| 579 | |||
| 580 | if (!block_size_is_power_of_two(cache)) | ||
| 581 | (void) sector_div(block_nr, cache->sectors_per_block); | ||
| 582 | else | ||
| 583 | block_nr >>= cache->sectors_per_block_shift; | ||
| 584 | |||
| 585 | return to_oblock(block_nr); | ||
| 586 | } | ||
| 587 | |||
| 588 | static int bio_triggers_commit(struct cache *cache, struct bio *bio) | ||
| 589 | { | ||
| 590 | return bio->bi_rw & (REQ_FLUSH | REQ_FUA); | ||
| 591 | } | ||
| 592 | |||
| 593 | static void issue(struct cache *cache, struct bio *bio) | ||
| 594 | { | ||
| 595 | unsigned long flags; | ||
| 596 | |||
| 597 | if (!bio_triggers_commit(cache, bio)) { | ||
| 598 | generic_make_request(bio); | ||
| 599 | return; | ||
| 600 | } | ||
| 601 | |||
| 602 | /* | ||
| 603 | * Batch together any bios that trigger commits and then issue a | ||
| 604 | * single commit for them in do_worker(). | ||
| 605 | */ | ||
| 606 | spin_lock_irqsave(&cache->lock, flags); | ||
| 607 | cache->commit_requested = true; | ||
| 608 | bio_list_add(&cache->deferred_flush_bios, bio); | ||
| 609 | spin_unlock_irqrestore(&cache->lock, flags); | ||
| 610 | } | ||
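
issue() implements a simple batching protocol: bios carrying REQ_FLUSH or REQ_FUA must not complete before the metadata they depend on is committed, so they are parked on deferred_flush_bios and released together after a single commit in do_worker(). A single-threaded sketch of the idea (names hypothetical, locking omitted):

    #include <stdbool.h>
    #include <stdio.h>

    #define BATCH_MAX 16

    /* Hypothetical stand-ins for the deferred_flush_bios list and the
     * commit_requested flag (the real code protects both with a spinlock). */
    static int deferred[BATCH_MAX];
    static int nr_deferred;
    static bool commit_requested;

    static void issue(int bio, bool triggers_commit)
    {
            if (!triggers_commit) {
                    printf("submit bio %d immediately\n", bio);
                    return;
            }
            commit_requested = true;
            deferred[nr_deferred++] = bio;  /* park until after the commit */
    }

    static void do_worker(void)
    {
            int i;

            if (!commit_requested)
                    return;
            printf("commit metadata once, covering %d bios\n", nr_deferred);
            for (i = 0; i < nr_deferred; i++)
                    printf("submit deferred bio %d\n", deferred[i]);
            nr_deferred = 0;
            commit_requested = false;
    }

    int main(void)
    {
            issue(1, false);        /* ordinary write */
            issue(2, true);         /* REQ_FLUSH-like */
            issue(3, true);         /* REQ_FUA-like */
            do_worker();            /* one commit releases bios 2 and 3 */
            return 0;
    }
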
| 611 | |||
| 612 | /*---------------------------------------------------------------- | ||
| 613 | * Migration processing | ||
| 614 | * | ||
| 615 | * Migration covers moving data from the origin device to the cache, or | ||
| 616 | * vice versa. | ||
| 617 | *--------------------------------------------------------------*/ | ||
| 618 | static void free_migration(struct dm_cache_migration *mg) | ||
| 619 | { | ||
| 620 | mempool_free(mg, mg->cache->migration_pool); | ||
| 621 | } | ||
| 622 | |||
| 623 | static void inc_nr_migrations(struct cache *cache) | ||
| 624 | { | ||
| 625 | atomic_inc(&cache->nr_migrations); | ||
| 626 | } | ||
| 627 | |||
| 628 | static void dec_nr_migrations(struct cache *cache) | ||
| 629 | { | ||
| 630 | atomic_dec(&cache->nr_migrations); | ||
| 631 | |||
| 632 | /* | ||
| 633 | * Wake the worker in case we're suspending the target. | ||
| 634 | */ | ||
| 635 | wake_up(&cache->migration_wait); | ||
| 636 | } | ||
| 637 | |||
| 638 | static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, | ||
| 639 | bool holder) | ||
| 640 | { | ||
| 641 | (holder ? dm_cell_release : dm_cell_release_no_holder) | ||
| 642 | (cache->prison, cell, &cache->deferred_bios); | ||
| 643 | free_prison_cell(cache, cell); | ||
| 644 | } | ||
| 645 | |||
| 646 | static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, | ||
| 647 | bool holder) | ||
| 648 | { | ||
| 649 | unsigned long flags; | ||
| 650 | |||
| 651 | spin_lock_irqsave(&cache->lock, flags); | ||
| 652 | __cell_defer(cache, cell, holder); | ||
| 653 | spin_unlock_irqrestore(&cache->lock, flags); | ||
| 654 | |||
| 655 | wake_worker(cache); | ||
| 656 | } | ||
| 657 | |||
| 658 | static void cleanup_migration(struct dm_cache_migration *mg) | ||
| 659 | { | ||
| 660 | dec_nr_migrations(mg->cache); | ||
| 661 | free_migration(mg); | ||
| 662 | } | ||
| 663 | |||
| 664 | static void migration_failure(struct dm_cache_migration *mg) | ||
| 665 | { | ||
| 666 | struct cache *cache = mg->cache; | ||
| 667 | |||
| 668 | if (mg->writeback) { | ||
| 669 | DMWARN_LIMIT("writeback failed; couldn't copy block"); | ||
| 670 | set_dirty(cache, mg->old_oblock, mg->cblock); | ||
| 671 | cell_defer(cache, mg->old_ocell, false); | ||
| 672 | |||
| 673 | } else if (mg->demote) { | ||
| 674 | DMWARN_LIMIT("demotion failed; couldn't copy block"); | ||
| 675 | policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock); | ||
| 676 | |||
| 677 | cell_defer(cache, mg->old_ocell, !mg->promote); | ||
| 678 | if (mg->promote) | ||
| 679 | cell_defer(cache, mg->new_ocell, true); | ||
| 680 | } else { | ||
| 681 | DMWARN_LIMIT("promotion failed; couldn't copy block"); | ||
| 682 | policy_remove_mapping(cache->policy, mg->new_oblock); | ||
| 683 | cell_defer(cache, mg->new_ocell, true); | ||
| 684 | } | ||
| 685 | |||
| 686 | cleanup_migration(mg); | ||
| 687 | } | ||
| 688 | |||
| 689 | static void migration_success_pre_commit(struct dm_cache_migration *mg) | ||
| 690 | { | ||
| 691 | unsigned long flags; | ||
| 692 | struct cache *cache = mg->cache; | ||
| 693 | |||
| 694 | if (mg->writeback) { | ||
| 695 | cell_defer(cache, mg->old_ocell, false); | ||
| 696 | clear_dirty(cache, mg->old_oblock, mg->cblock); | ||
| 697 | cleanup_migration(mg); | ||
| 698 | return; | ||
| 699 | |||
| 700 | } else if (mg->demote) { | ||
| 701 | if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) { | ||
| 702 | DMWARN_LIMIT("demotion failed; couldn't update on disk metadata"); | ||
| 703 | policy_force_mapping(cache->policy, mg->new_oblock, | ||
| 704 | mg->old_oblock); | ||
| 705 | if (mg->promote) | ||
| 706 | cell_defer(cache, mg->new_ocell, true); | ||
| 707 | cleanup_migration(mg); | ||
| 708 | return; | ||
| 709 | } | ||
| 710 | } else { | ||
| 711 | if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) { | ||
| 712 | DMWARN_LIMIT("promotion failed; couldn't update on disk metadata"); | ||
| 713 | policy_remove_mapping(cache->policy, mg->new_oblock); | ||
| 714 | cleanup_migration(mg); | ||
| 715 | return; | ||
| 716 | } | ||
| 717 | } | ||
| 718 | |||
| 719 | spin_lock_irqsave(&cache->lock, flags); | ||
| 720 | list_add_tail(&mg->list, &cache->need_commit_migrations); | ||
| 721 | cache->commit_requested = true; | ||
| 722 | spin_unlock_irqrestore(&cache->lock, flags); | ||
| 723 | } | ||
| 724 | |||
| 725 | static void migration_success_post_commit(struct dm_cache_migration *mg) | ||
| 726 | { | ||
| 727 | unsigned long flags; | ||
| 728 | struct cache *cache = mg->cache; | ||
| 729 | |||
| 730 | if (mg->writeback) { | ||
| 731 | DMWARN("writeback unexpectedly triggered commit"); | ||
| 732 | return; | ||
| 733 | |||
| 734 | } else if (mg->demote) { | ||
| 735 | cell_defer(cache, mg->old_ocell, !mg->promote); | ||
| 736 | |||
| 737 | if (mg->promote) { | ||
| 738 | mg->demote = false; | ||
| 739 | |||
| 740 | spin_lock_irqsave(&cache->lock, flags); | ||
| 741 | list_add_tail(&mg->list, &cache->quiesced_migrations); | ||
| 742 | spin_unlock_irqrestore(&cache->lock, flags); | ||
| 743 | |||
| 744 | } else | ||
| 745 | cleanup_migration(mg); | ||
| 746 | |||
| 747 | } else { | ||
| 748 | cell_defer(cache, mg->new_ocell, true); | ||
| 749 | clear_dirty(cache, mg->new_oblock, mg->cblock); | ||
| 750 | cleanup_migration(mg); | ||
| 751 | } | ||
| 752 | } | ||
| 753 | |||
| 754 | static void copy_complete(int read_err, unsigned long write_err, void *context) | ||
| 755 | { | ||
| 756 | unsigned long flags; | ||
| 757 | struct dm_cache_migration *mg = (struct dm_cache_migration *) context; | ||
| 758 | struct cache *cache = mg->cache; | ||
| 759 | |||
| 760 | if (read_err || write_err) | ||
| 761 | mg->err = true; | ||
| 762 | |||
| 763 | spin_lock_irqsave(&cache->lock, flags); | ||
| 764 | list_add_tail(&mg->list, &cache->completed_migrations); | ||
| 765 | spin_unlock_irqrestore(&cache->lock, flags); | ||
| 766 | |||
| 767 | wake_worker(cache); | ||
| 768 | } | ||
| 769 | |||
| 770 | static void issue_copy_real(struct dm_cache_migration *mg) | ||
| 771 | { | ||
| 772 | int r; | ||
| 773 | struct dm_io_region o_region, c_region; | ||
| 774 | struct cache *cache = mg->cache; | ||
| 775 | |||
| 776 | o_region.bdev = cache->origin_dev->bdev; | ||
| 777 | o_region.count = cache->sectors_per_block; | ||
| 778 | |||
| 779 | c_region.bdev = cache->cache_dev->bdev; | ||
| 780 | c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block; | ||
| 781 | c_region.count = cache->sectors_per_block; | ||
| 782 | |||
| 783 | if (mg->writeback || mg->demote) { | ||
| 784 | /* demote */ | ||
| 785 | o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block; | ||
| 786 | r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg); | ||
| 787 | } else { | ||
| 788 | /* promote */ | ||
| 789 | o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block; | ||
| 790 | r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg); | ||
| 791 | } | ||
| 792 | |||
| 793 | if (r < 0) | ||
| 794 | migration_failure(mg); | ||
| 795 | } | ||
| 796 | |||
| 797 | static void avoid_copy(struct dm_cache_migration *mg) | ||
| 798 | { | ||
| 799 | atomic_inc(&mg->cache->stats.copies_avoided); | ||
| 800 | migration_success_pre_commit(mg); | ||
| 801 | } | ||
| 802 | |||
| 803 | static void issue_copy(struct dm_cache_migration *mg) | ||
| 804 | { | ||
| 805 | bool avoid; | ||
| 806 | struct cache *cache = mg->cache; | ||
| 807 | |||
| 808 | if (mg->writeback || mg->demote) | ||
| 809 | avoid = !is_dirty(cache, mg->cblock) || | ||
| 810 | is_discarded_oblock(cache, mg->old_oblock); | ||
| 811 | else | ||
| 812 | avoid = is_discarded_oblock(cache, mg->new_oblock); | ||
| 813 | |||
| 814 | avoid ? avoid_copy(mg) : issue_copy_real(mg); | ||
| 815 | } | ||
| 816 | |||
| 817 | static void complete_migration(struct dm_cache_migration *mg) | ||
| 818 | { | ||
| 819 | if (mg->err) | ||
| 820 | migration_failure(mg); | ||
| 821 | else | ||
| 822 | migration_success_pre_commit(mg); | ||
| 823 | } | ||
| 824 | |||
| 825 | static void process_migrations(struct cache *cache, struct list_head *head, | ||
| 826 | void (*fn)(struct dm_cache_migration *)) | ||
| 827 | { | ||
| 828 | unsigned long flags; | ||
| 829 | struct list_head list; | ||
| 830 | struct dm_cache_migration *mg, *tmp; | ||
| 831 | |||
| 832 | INIT_LIST_HEAD(&list); | ||
| 833 | spin_lock_irqsave(&cache->lock, flags); | ||
| 834 | list_splice_init(head, &list); | ||
| 835 | spin_unlock_irqrestore(&cache->lock, flags); | ||
| 836 | |||
| 837 | list_for_each_entry_safe(mg, tmp, &list, list) | ||
| 838 | fn(mg); | ||
| 839 | } | ||
| 840 | |||
| 841 | static void __queue_quiesced_migration(struct dm_cache_migration *mg) | ||
| 842 | { | ||
| 843 | list_add_tail(&mg->list, &mg->cache->quiesced_migrations); | ||
| 844 | } | ||
| 845 | |||
| 846 | static void queue_quiesced_migration(struct dm_cache_migration *mg) | ||
| 847 | { | ||
| 848 | unsigned long flags; | ||
| 849 | struct cache *cache = mg->cache; | ||
| 850 | |||
| 851 | spin_lock_irqsave(&cache->lock, flags); | ||
| 852 | __queue_quiesced_migration(mg); | ||
| 853 | spin_unlock_irqrestore(&cache->lock, flags); | ||
| 854 | |||
| 855 | wake_worker(cache); | ||
| 856 | } | ||
| 857 | |||
| 858 | static void queue_quiesced_migrations(struct cache *cache, struct list_head *work) | ||
| 859 | { | ||
| 860 | unsigned long flags; | ||
| 861 | struct dm_cache_migration *mg, *tmp; | ||
| 862 | |||
| 863 | spin_lock_irqsave(&cache->lock, flags); | ||
| 864 | list_for_each_entry_safe(mg, tmp, work, list) | ||
| 865 | __queue_quiesced_migration(mg); | ||
| 866 | spin_unlock_irqrestore(&cache->lock, flags); | ||
| 867 | |||
| 868 | wake_worker(cache); | ||
| 869 | } | ||
| 870 | |||
| 871 | static void check_for_quiesced_migrations(struct cache *cache, | ||
| 872 | struct per_bio_data *pb) | ||
| 873 | { | ||
| 874 | struct list_head work; | ||
| 875 | |||
| 876 | if (!pb->all_io_entry) | ||
| 877 | return; | ||
| 878 | |||
| 879 | INIT_LIST_HEAD(&work); | ||
| 880 | dm_deferred_entry_dec(pb->all_io_entry, &work); | ||
| 882 | |||
| 883 | if (!list_empty(&work)) | ||
| 884 | queue_quiesced_migrations(cache, &work); | ||
| 885 | } | ||
| 886 | |||
| 887 | static void quiesce_migration(struct dm_cache_migration *mg) | ||
| 888 | { | ||
| 889 | if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list)) | ||
| 890 | queue_quiesced_migration(mg); | ||
| 891 | } | ||
| 892 | |||
| 893 | static void promote(struct cache *cache, struct prealloc *structs, | ||
| 894 | dm_oblock_t oblock, dm_cblock_t cblock, | ||
| 895 | struct dm_bio_prison_cell *cell) | ||
| 896 | { | ||
| 897 | struct dm_cache_migration *mg = prealloc_get_migration(structs); | ||
| 898 | |||
| 899 | mg->err = false; | ||
| 900 | mg->writeback = false; | ||
| 901 | mg->demote = false; | ||
| 902 | mg->promote = true; | ||
| 903 | mg->cache = cache; | ||
| 904 | mg->new_oblock = oblock; | ||
| 905 | mg->cblock = cblock; | ||
| 906 | mg->old_ocell = NULL; | ||
| 907 | mg->new_ocell = cell; | ||
| 908 | mg->start_jiffies = jiffies; | ||
| 909 | |||
| 910 | inc_nr_migrations(cache); | ||
| 911 | quiesce_migration(mg); | ||
| 912 | } | ||
| 913 | |||
| 914 | static void writeback(struct cache *cache, struct prealloc *structs, | ||
| 915 | dm_oblock_t oblock, dm_cblock_t cblock, | ||
| 916 | struct dm_bio_prison_cell *cell) | ||
| 917 | { | ||
| 918 | struct dm_cache_migration *mg = prealloc_get_migration(structs); | ||
| 919 | |||
| 920 | mg->err = false; | ||
| 921 | mg->writeback = true; | ||
| 922 | mg->demote = false; | ||
| 923 | mg->promote = false; | ||
| 924 | mg->cache = cache; | ||
| 925 | mg->old_oblock = oblock; | ||
| 926 | mg->cblock = cblock; | ||
| 927 | mg->old_ocell = cell; | ||
| 928 | mg->new_ocell = NULL; | ||
| 929 | mg->start_jiffies = jiffies; | ||
| 930 | |||
| 931 | inc_nr_migrations(cache); | ||
| 932 | quiesce_migration(mg); | ||
| 933 | } | ||
| 934 | |||
| 935 | static void demote_then_promote(struct cache *cache, struct prealloc *structs, | ||
| 936 | dm_oblock_t old_oblock, dm_oblock_t new_oblock, | ||
| 937 | dm_cblock_t cblock, | ||
| 938 | struct dm_bio_prison_cell *old_ocell, | ||
| 939 | struct dm_bio_prison_cell *new_ocell) | ||
| 940 | { | ||
| 941 | struct dm_cache_migration *mg = prealloc_get_migration(structs); | ||
| 942 | |||
| 943 | mg->err = false; | ||
| 944 | mg->writeback = false; | ||
| 945 | mg->demote = true; | ||
| 946 | mg->promote = true; | ||
| 947 | mg->cache = cache; | ||
| 948 | mg->old_oblock = old_oblock; | ||
| 949 | mg->new_oblock = new_oblock; | ||
| 950 | mg->cblock = cblock; | ||
| 951 | mg->old_ocell = old_ocell; | ||
| 952 | mg->new_ocell = new_ocell; | ||
| 953 | mg->start_jiffies = jiffies; | ||
| 954 | |||
| 955 | inc_nr_migrations(cache); | ||
| 956 | quiesce_migration(mg); | ||
| 957 | } | ||
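
The three helpers above differ only in the flag and cell combination they program into the migration; summarizing what each one sets:

    /*
     * Derived from the constructors above:
     *
     *                         writeback  demote  promote  old_ocell  new_ocell
     * writeback()               true      false   false     held       NULL
     * promote()                 false     false   true      NULL       held
     * demote_then_promote()     false     true    true      held       held
     */
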
| 958 | |||
| 959 | /*---------------------------------------------------------------- | ||
| 960 | * bio processing | ||
| 961 | *--------------------------------------------------------------*/ | ||
| 962 | static void defer_bio(struct cache *cache, struct bio *bio) | ||
| 963 | { | ||
| 964 | unsigned long flags; | ||
| 965 | |||
| 966 | spin_lock_irqsave(&cache->lock, flags); | ||
| 967 | bio_list_add(&cache->deferred_bios, bio); | ||
| 968 | spin_unlock_irqrestore(&cache->lock, flags); | ||
| 969 | |||
| 970 | wake_worker(cache); | ||
| 971 | } | ||
| 972 | |||
| 973 | static void process_flush_bio(struct cache *cache, struct bio *bio) | ||
| 974 | { | ||
| 975 | struct per_bio_data *pb = get_per_bio_data(bio); | ||
| 976 | |||
| 977 | BUG_ON(bio->bi_size); | ||
| 978 | if (!pb->req_nr) | ||
| 979 | remap_to_origin(cache, bio); | ||
| 980 | else | ||
| 981 | remap_to_cache(cache, bio, 0); | ||
| 982 | |||
| 983 | issue(cache, bio); | ||
| 984 | } | ||
| 985 | |||
| 986 | /* | ||
| 987 | * People generally discard large parts of a device, e.g. the whole device | ||
| 988 | * when formatting. Splitting these large discards up into cache block | ||
| 989 | * sized ios and then quiescing (always necessary for discard) takes too | ||
| 990 | * long. | ||
| 991 | * | ||
| 992 | * We keep it simple, and allow any size of discard to come in, and just | ||
| 993 | * mark off blocks on the discard bitset. No passdown occurs! | ||
| 994 | * | ||
| 995 | * To implement passdown we need to change the bio_prison such that a cell | ||
| 996 | * can have a key that spans many blocks. | ||
| 997 | */ | ||
| 998 | static void process_discard_bio(struct cache *cache, struct bio *bio) | ||
| 999 | { | ||
| 1000 | dm_block_t start_block = dm_sector_div_up(bio->bi_sector, | ||
| 1001 | cache->discard_block_size); | ||
| 1002 | dm_block_t end_block = bio->bi_sector + bio_sectors(bio); | ||
| 1003 | dm_block_t b; | ||
| 1004 | |||
| 1005 | (void) sector_div(end_block, cache->discard_block_size); | ||
| 1006 | |||
| 1007 | for (b = start_block; b < end_block; b++) | ||
| 1008 | set_discard(cache, to_dblock(b)); | ||
| 1009 | |||
| 1010 | bio_endio(bio, 0); | ||
| 1011 | } | ||
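
Note the asymmetric rounding: the start sector is rounded up and the end sector rounded down, so only discard blocks wholly covered by the bio are marked. A worked sketch with a hypothetical 1024-sector discard block:

    #include <stdint.h>
    #include <stdio.h>

    #define DISCARD_BLOCK_SIZE 1024u        /* sectors, hypothetical */

    int main(void)
    {
            uint64_t start_sector = 1500, nr_sectors = 4000;
            uint64_t end_sector = start_sector + nr_sectors;        /* 5500 */

            /* Round the start up and the end down to whole discard blocks,
             * mirroring dm_sector_div_up() and sector_div() above. */
            uint64_t start_block =
                    (start_sector + DISCARD_BLOCK_SIZE - 1) / DISCARD_BLOCK_SIZE; /* 2 */
            uint64_t end_block = end_sector / DISCARD_BLOCK_SIZE;                 /* 5 */
            uint64_t b;

            for (b = start_block; b < end_block; b++)
                    printf("mark discard block %llu\n", (unsigned long long)b);
            return 0;
    }

A discard of sectors 1500..5499 therefore marks blocks 2..4 (sectors 2048..5119) and leaves the partially covered blocks 1 and 5 alone.
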
| 1012 | |||
| 1013 | static bool spare_migration_bandwidth(struct cache *cache) | ||
| 1014 | { | ||
| 1015 | sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) * | ||
| 1016 | cache->sectors_per_block; | ||
| 1017 | return current_volume < cache->migration_threshold; | ||
| 1018 | } | ||
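
The threshold is expressed in sectors of in-flight copy I/O, not in migrations; the +1 accounts for the migration being considered. A quick check of the implied concurrency under the default threshold (defined later in this file as 2048 * 100 sectors) and a hypothetical 128-sector block:

    #include <stdio.h>

    int main(void)
    {
            unsigned long threshold = 2048 * 100;   /* DEFAULT_MIGRATION_THRESHOLD, sectors */
            unsigned long spb = 128;                /* hypothetical sectors per block */
            unsigned long nr = 0;

            /* spare_migration_bandwidth(): admit another migration while the
             * in-flight copy volume, plus this one, stays under the threshold. */
            while ((nr + 1) * spb < threshold)
                    nr++;

            printf("up to %lu concurrent migrations\n", nr);        /* 1599 */
            return 0;
    }
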
| 1019 | |||
| 1020 | static bool is_writethrough_io(struct cache *cache, struct bio *bio, | ||
| 1021 | dm_cblock_t cblock) | ||
| 1022 | { | ||
| 1023 | return bio_data_dir(bio) == WRITE && | ||
| 1024 | cache->features.write_through && !is_dirty(cache, cblock); | ||
| 1025 | } | ||
| 1026 | |||
| 1027 | static void inc_hit_counter(struct cache *cache, struct bio *bio) | ||
| 1028 | { | ||
| 1029 | atomic_inc(bio_data_dir(bio) == READ ? | ||
| 1030 | &cache->stats.read_hit : &cache->stats.write_hit); | ||
| 1031 | } | ||
| 1032 | |||
| 1033 | static void inc_miss_counter(struct cache *cache, struct bio *bio) | ||
| 1034 | { | ||
| 1035 | atomic_inc(bio_data_dir(bio) == READ ? | ||
| 1036 | &cache->stats.read_miss : &cache->stats.write_miss); | ||
| 1037 | } | ||
| 1038 | |||
| 1039 | static void process_bio(struct cache *cache, struct prealloc *structs, | ||
| 1040 | struct bio *bio) | ||
| 1041 | { | ||
| 1042 | int r; | ||
| 1043 | bool release_cell = true; | ||
| 1044 | dm_oblock_t block = get_bio_block(cache, bio); | ||
| 1045 | struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell; | ||
| 1046 | struct policy_result lookup_result; | ||
| 1047 | struct per_bio_data *pb = get_per_bio_data(bio); | ||
| 1048 | bool discarded_block = is_discarded_oblock(cache, block); | ||
| 1049 | bool can_migrate = discarded_block || spare_migration_bandwidth(cache); | ||
| 1050 | |||
| 1051 | /* | ||
| 1052 | * Check to see if that block is currently migrating. | ||
| 1053 | */ | ||
| 1054 | cell_prealloc = prealloc_get_cell(structs); | ||
| 1055 | r = bio_detain(cache, block, bio, cell_prealloc, | ||
| 1056 | (cell_free_fn) prealloc_put_cell, | ||
| 1057 | structs, &new_ocell); | ||
| 1058 | if (r > 0) | ||
| 1059 | return; | ||
| 1060 | |||
| 1061 | r = policy_map(cache->policy, block, true, can_migrate, discarded_block, | ||
| 1062 | bio, &lookup_result); | ||
| 1063 | |||
| 1064 | if (r == -EWOULDBLOCK) | ||
| 1065 | /* migration has been denied */ | ||
| 1066 | lookup_result.op = POLICY_MISS; | ||
| 1067 | |||
| 1068 | switch (lookup_result.op) { | ||
| 1069 | case POLICY_HIT: | ||
| 1070 | inc_hit_counter(cache, bio); | ||
| 1071 | pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); | ||
| 1072 | |||
| 1073 | if (is_writethrough_io(cache, bio, lookup_result.cblock)) { | ||
| 1074 | /* | ||
| 1075 | * No need to mark anything dirty in write through mode. | ||
| 1076 | */ | ||
| 1077 | if (pb->req_nr == 0) | ||
| 1078 | remap_to_cache(cache, bio, lookup_result.cblock); | ||
| | else | ||
| 1079 | remap_to_origin_clear_discard(cache, bio, block); | ||
| 1080 | } else | ||
| 1081 | remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); | ||
| 1082 | |||
| 1083 | issue(cache, bio); | ||
| 1084 | break; | ||
| 1085 | |||
| 1086 | case POLICY_MISS: | ||
| 1087 | inc_miss_counter(cache, bio); | ||
| 1088 | pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); | ||
| 1089 | |||
| 1090 | if (pb->req_nr != 0) { | ||
| 1091 | /* | ||
| 1092 | * This is a duplicate writethrough io that is no | ||
| 1093 | * longer needed because the block has been demoted. | ||
| 1094 | */ | ||
| 1095 | bio_endio(bio, 0); | ||
| 1096 | } else { | ||
| 1097 | remap_to_origin_clear_discard(cache, bio, block); | ||
| 1098 | issue(cache, bio); | ||
| 1099 | } | ||
| 1100 | break; | ||
| 1101 | |||
| 1102 | case POLICY_NEW: | ||
| 1103 | atomic_inc(&cache->stats.promotion); | ||
| 1104 | promote(cache, structs, block, lookup_result.cblock, new_ocell); | ||
| 1105 | release_cell = false; | ||
| 1106 | break; | ||
| 1107 | |||
| 1108 | case POLICY_REPLACE: | ||
| 1109 | cell_prealloc = prealloc_get_cell(structs); | ||
| 1110 | r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc, | ||
| 1111 | (cell_free_fn) prealloc_put_cell, | ||
| 1112 | structs, &old_ocell); | ||
| 1113 | if (r > 0) { | ||
| 1114 | /* | ||
| 1115 | * We have to be careful to avoid lock inversion of | ||
| 1116 | * the cells. So we back off, and wait for the | ||
| 1117 | * old_ocell to become free. | ||
| 1118 | */ | ||
| 1119 | policy_force_mapping(cache->policy, block, | ||
| 1120 | lookup_result.old_oblock); | ||
| 1121 | atomic_inc(&cache->stats.cache_cell_clash); | ||
| 1122 | break; | ||
| 1123 | } | ||
| 1124 | atomic_inc(&cache->stats.demotion); | ||
| 1125 | atomic_inc(&cache->stats.promotion); | ||
| 1126 | |||
| 1127 | demote_then_promote(cache, structs, lookup_result.old_oblock, | ||
| 1128 | block, lookup_result.cblock, | ||
| 1129 | old_ocell, new_ocell); | ||
| 1130 | release_cell = false; | ||
| 1131 | break; | ||
| 1132 | |||
| 1133 | default: | ||
| 1134 | DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__, | ||
| 1135 | (unsigned) lookup_result.op); | ||
| 1136 | bio_io_error(bio); | ||
| 1137 | } | ||
| 1138 | |||
| 1139 | if (release_cell) | ||
| 1140 | cell_defer(cache, new_ocell, false); | ||
| 1141 | } | ||
| 1142 | |||
| 1143 | static int need_commit_due_to_time(struct cache *cache) | ||
| 1144 | { | ||
| 1145 | return jiffies < cache->last_commit_jiffies || | ||
| 1146 | jiffies > cache->last_commit_jiffies + COMMIT_PERIOD; | ||
| 1147 | } | ||
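
The first clause makes the test safe against jiffies wrapping (or last_commit_jiffies lying in the future after a resume): a wrapped counter forces a commit immediately instead of stalling until jiffies catches up. A sketch with a 32-bit counter:

    #include <assert.h>

    typedef unsigned int u32;       /* 32-bit "jiffies" for the sketch */

    static int need_commit(u32 now, u32 last, u32 period)
    {
            return now < last || now > last + period;
    }

    int main(void)
    {
            assert(!need_commit(1000, 900, 500));   /* period not yet elapsed */
            assert(need_commit(1500, 900, 500));    /* period elapsed */
            /* Counter wrapped since the last commit: commit immediately
             * rather than waiting for 'now' to catch 'last' up. */
            assert(need_commit(5, 0xfffffff0u, 500));
            return 0;
    }

The kernel's time_after() helpers are the more common idiom for such checks; the open-coded form above trades a little precision around the wrap for simplicity.
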
| 1148 | |||
| 1149 | static int commit_if_needed(struct cache *cache) | ||
| 1150 | { | ||
| 1151 | if (dm_cache_changed_this_transaction(cache->cmd) && | ||
| 1152 | (cache->commit_requested || need_commit_due_to_time(cache))) { | ||
| 1153 | atomic_inc(&cache->stats.commit_count); | ||
| 1154 | cache->last_commit_jiffies = jiffies; | ||
| 1155 | cache->commit_requested = false; | ||
| 1156 | return dm_cache_commit(cache->cmd, false); | ||
| 1157 | } | ||
| 1158 | |||
| 1159 | return 0; | ||
| 1160 | } | ||
| 1161 | |||
| 1162 | static void process_deferred_bios(struct cache *cache) | ||
| 1163 | { | ||
| 1164 | unsigned long flags; | ||
| 1165 | struct bio_list bios; | ||
| 1166 | struct bio *bio; | ||
| 1167 | struct prealloc structs; | ||
| 1168 | |||
| 1169 | memset(&structs, 0, sizeof(structs)); | ||
| 1170 | bio_list_init(&bios); | ||
| 1171 | |||
| 1172 | spin_lock_irqsave(&cache->lock, flags); | ||
| 1173 | bio_list_merge(&bios, &cache->deferred_bios); | ||
| 1174 | bio_list_init(&cache->deferred_bios); | ||
| 1175 | spin_unlock_irqrestore(&cache->lock, flags); | ||
| 1176 | |||
| 1177 | while (!bio_list_empty(&bios)) { | ||
| 1178 | /* | ||
| 1179 | * If we've got no free migration structs, and processing | ||
| 1180 | * this bio might require one, we pause until there are some | ||
| 1181 | * prepared mappings to process. | ||
| 1182 | */ | ||
| 1183 | if (prealloc_data_structs(cache, &structs)) { | ||
| 1184 | spin_lock_irqsave(&cache->lock, flags); | ||
| 1185 | bio_list_merge(&cache->deferred_bios, &bios); | ||
| 1186 | spin_unlock_irqrestore(&cache->lock, flags); | ||
| 1187 | break; | ||
| 1188 | } | ||
| 1189 | |||
| 1190 | bio = bio_list_pop(&bios); | ||
| 1191 | |||
| 1192 | if (bio->bi_rw & REQ_FLUSH) | ||
| 1193 | process_flush_bio(cache, bio); | ||
| 1194 | else if (bio->bi_rw & REQ_DISCARD) | ||
| 1195 | process_discard_bio(cache, bio); | ||
| 1196 | else | ||
| 1197 | process_bio(cache, &structs, bio); | ||
| 1198 | } | ||
| 1199 | |||
| 1200 | prealloc_free_structs(cache, &structs); | ||
| 1201 | } | ||
| 1202 | |||
| 1203 | static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) | ||
| 1204 | { | ||
| 1205 | unsigned long flags; | ||
| 1206 | struct bio_list bios; | ||
| 1207 | struct bio *bio; | ||
| 1208 | |||
| 1209 | bio_list_init(&bios); | ||
| 1210 | |||
| 1211 | spin_lock_irqsave(&cache->lock, flags); | ||
| 1212 | bio_list_merge(&bios, &cache->deferred_flush_bios); | ||
| 1213 | bio_list_init(&cache->deferred_flush_bios); | ||
| 1214 | spin_unlock_irqrestore(&cache->lock, flags); | ||
| 1215 | |||
| 1216 | while ((bio = bio_list_pop(&bios))) | ||
| 1217 | submit_bios ? generic_make_request(bio) : bio_io_error(bio); | ||
| 1218 | } | ||
| 1219 | |||
| 1220 | static void writeback_some_dirty_blocks(struct cache *cache) | ||
| 1221 | { | ||
| 1222 | int r = 0; | ||
| 1223 | dm_oblock_t oblock; | ||
| 1224 | dm_cblock_t cblock; | ||
| 1225 | struct prealloc structs; | ||
| 1226 | struct dm_bio_prison_cell *old_ocell; | ||
| 1227 | |||
| 1228 | memset(&structs, 0, sizeof(structs)); | ||
| 1229 | |||
| 1230 | while (spare_migration_bandwidth(cache)) { | ||
| 1231 | if (prealloc_data_structs(cache, &structs)) | ||
| 1232 | break; | ||
| 1233 | |||
| 1234 | r = policy_writeback_work(cache->policy, &oblock, &cblock); | ||
| 1235 | if (r) | ||
| 1236 | break; | ||
| 1237 | |||
| 1238 | r = get_cell(cache, oblock, &structs, &old_ocell); | ||
| 1239 | if (r) { | ||
| 1240 | policy_set_dirty(cache->policy, oblock); | ||
| 1241 | break; | ||
| 1242 | } | ||
| 1243 | |||
| 1244 | writeback(cache, &structs, oblock, cblock, old_ocell); | ||
| 1245 | } | ||
| 1246 | |||
| 1247 | prealloc_free_structs(cache, &structs); | ||
| 1248 | } | ||
| 1249 | |||
| 1250 | /*---------------------------------------------------------------- | ||
| 1251 | * Main worker loop | ||
| 1252 | *--------------------------------------------------------------*/ | ||
| 1253 | static void start_quiescing(struct cache *cache) | ||
| 1254 | { | ||
| 1255 | unsigned long flags; | ||
| 1256 | |||
| 1257 | spin_lock_irqsave(&cache->lock, flags); | ||
| 1258 | cache->quiescing = true; | ||
| 1259 | spin_unlock_irqrestore(&cache->lock, flags); | ||
| 1260 | } | ||
| 1261 | |||
| 1262 | static void stop_quiescing(struct cache *cache) | ||
| 1263 | { | ||
| 1264 | unsigned long flags; | ||
| 1265 | |||
| 1266 | spin_lock_irqsave(&cache->lock, flags); | ||
| 1267 | cache->quiescing = false; | ||
| 1268 | spin_unlock_irqrestore(&cache->lock, flags); | ||
| 1269 | } | ||
| 1270 | |||
| 1271 | static bool is_quiescing(struct cache *cache) | ||
| 1272 | { | ||
| 1273 | int r; | ||
| 1274 | unsigned long flags; | ||
| 1275 | |||
| 1276 | spin_lock_irqsave(&cache->lock, flags); | ||
| 1277 | r = cache->quiescing; | ||
| 1278 | spin_unlock_irqrestore(&cache->lock, flags); | ||
| 1279 | |||
| 1280 | return r; | ||
| 1281 | } | ||
| 1282 | |||
| 1283 | static void wait_for_migrations(struct cache *cache) | ||
| 1284 | { | ||
| 1285 | wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations)); | ||
| 1286 | } | ||
| 1287 | |||
| 1288 | static void stop_worker(struct cache *cache) | ||
| 1289 | { | ||
| 1290 | cancel_delayed_work(&cache->waker); | ||
| 1291 | flush_workqueue(cache->wq); | ||
| 1292 | } | ||
| 1293 | |||
| 1294 | static void requeue_deferred_io(struct cache *cache) | ||
| 1295 | { | ||
| 1296 | struct bio *bio; | ||
| 1297 | struct bio_list bios; | ||
| 1298 | |||
| 1299 | bio_list_init(&bios); | ||
| 1300 | bio_list_merge(&bios, &cache->deferred_bios); | ||
| 1301 | bio_list_init(&cache->deferred_bios); | ||
| 1302 | |||
| 1303 | while ((bio = bio_list_pop(&bios))) | ||
| 1304 | bio_endio(bio, DM_ENDIO_REQUEUE); | ||
| 1305 | } | ||
| 1306 | |||
| 1307 | static int more_work(struct cache *cache) | ||
| 1308 | { | ||
| 1309 | if (is_quiescing(cache)) | ||
| 1310 | return !list_empty(&cache->quiesced_migrations) || | ||
| 1311 | !list_empty(&cache->completed_migrations) || | ||
| 1312 | !list_empty(&cache->need_commit_migrations); | ||
| 1313 | else | ||
| 1314 | return !bio_list_empty(&cache->deferred_bios) || | ||
| 1315 | !bio_list_empty(&cache->deferred_flush_bios) || | ||
| 1316 | !list_empty(&cache->quiesced_migrations) || | ||
| 1317 | !list_empty(&cache->completed_migrations) || | ||
| 1318 | !list_empty(&cache->need_commit_migrations); | ||
| 1319 | } | ||
| 1320 | |||
| 1321 | static void do_worker(struct work_struct *ws) | ||
| 1322 | { | ||
| 1323 | struct cache *cache = container_of(ws, struct cache, worker); | ||
| 1324 | |||
| 1325 | do { | ||
| 1326 | if (!is_quiescing(cache)) | ||
| 1327 | process_deferred_bios(cache); | ||
| 1328 | |||
| 1329 | process_migrations(cache, &cache->quiesced_migrations, issue_copy); | ||
| 1330 | process_migrations(cache, &cache->completed_migrations, complete_migration); | ||
| 1331 | |||
| 1332 | writeback_some_dirty_blocks(cache); | ||
| 1333 | |||
| 1334 | if (commit_if_needed(cache)) { | ||
| 1335 | process_deferred_flush_bios(cache, false); | ||
| 1336 | |||
| 1337 | /* | ||
| 1338 | * FIXME: rollback metadata or just go into a | ||
| 1339 | * failure mode and error everything | ||
| 1340 | */ | ||
| 1341 | } else { | ||
| 1342 | process_deferred_flush_bios(cache, true); | ||
| 1343 | process_migrations(cache, &cache->need_commit_migrations, | ||
| 1344 | migration_success_post_commit); | ||
| 1345 | } | ||
| 1346 | } while (more_work(cache)); | ||
| 1347 | } | ||
| 1348 | |||
| 1349 | /* | ||
| 1350 | * We want to commit periodically so that not too much | ||
| 1351 | * unwritten metadata builds up. | ||
| 1352 | */ | ||
| 1353 | static void do_waker(struct work_struct *ws) | ||
| 1354 | { | ||
| 1355 | struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); | ||
| 1356 | wake_worker(cache); | ||
| 1357 | queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); | ||
| 1358 | } | ||
| 1359 | |||
| 1360 | /*----------------------------------------------------------------*/ | ||
| 1361 | |||
| 1362 | static int is_congested(struct dm_dev *dev, int bdi_bits) | ||
| 1363 | { | ||
| 1364 | struct request_queue *q = bdev_get_queue(dev->bdev); | ||
| 1365 | return bdi_congested(&q->backing_dev_info, bdi_bits); | ||
| 1366 | } | ||
| 1367 | |||
| 1368 | static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) | ||
| 1369 | { | ||
| 1370 | struct cache *cache = container_of(cb, struct cache, callbacks); | ||
| 1371 | |||
| 1372 | return is_congested(cache->origin_dev, bdi_bits) || | ||
| 1373 | is_congested(cache->cache_dev, bdi_bits); | ||
| 1374 | } | ||
| 1375 | |||
| 1376 | /*---------------------------------------------------------------- | ||
| 1377 | * Target methods | ||
| 1378 | *--------------------------------------------------------------*/ | ||
| 1379 | |||
| 1380 | /* | ||
| 1381 | * This function gets called on the error paths of the constructor, so we | ||
| 1382 | * have to cope with a partially initialised struct. | ||
| 1383 | */ | ||
| 1384 | static void destroy(struct cache *cache) | ||
| 1385 | { | ||
| 1386 | unsigned i; | ||
| 1387 | |||
| 1388 | if (cache->next_migration) | ||
| 1389 | mempool_free(cache->next_migration, cache->migration_pool); | ||
| 1390 | |||
| 1391 | if (cache->migration_pool) | ||
| 1392 | mempool_destroy(cache->migration_pool); | ||
| 1393 | |||
| 1394 | if (cache->all_io_ds) | ||
| 1395 | dm_deferred_set_destroy(cache->all_io_ds); | ||
| 1396 | |||
| 1397 | if (cache->prison) | ||
| 1398 | dm_bio_prison_destroy(cache->prison); | ||
| 1399 | |||
| 1400 | if (cache->wq) | ||
| 1401 | destroy_workqueue(cache->wq); | ||
| 1402 | |||
| 1403 | if (cache->dirty_bitset) | ||
| 1404 | free_bitset(cache->dirty_bitset); | ||
| 1405 | |||
| 1406 | if (cache->discard_bitset) | ||
| 1407 | free_bitset(cache->discard_bitset); | ||
| 1408 | |||
| 1409 | if (cache->copier) | ||
| 1410 | dm_kcopyd_client_destroy(cache->copier); | ||
| 1411 | |||
| 1412 | if (cache->cmd) | ||
| 1413 | dm_cache_metadata_close(cache->cmd); | ||
| 1414 | |||
| 1415 | if (cache->metadata_dev) | ||
| 1416 | dm_put_device(cache->ti, cache->metadata_dev); | ||
| 1417 | |||
| 1418 | if (cache->origin_dev) | ||
| 1419 | dm_put_device(cache->ti, cache->origin_dev); | ||
| 1420 | |||
| 1421 | if (cache->cache_dev) | ||
| 1422 | dm_put_device(cache->ti, cache->cache_dev); | ||
| 1423 | |||
| 1424 | if (cache->policy) | ||
| 1425 | dm_cache_policy_destroy(cache->policy); | ||
| 1426 | |||
| 1427 | for (i = 0; i < cache->nr_ctr_args; i++) | ||
| 1428 | kfree(cache->ctr_args[i]); | ||
| 1429 | kfree(cache->ctr_args); | ||
| 1430 | |||
| 1431 | kfree(cache); | ||
| 1432 | } | ||
| 1433 | |||
| 1434 | static void cache_dtr(struct dm_target *ti) | ||
| 1435 | { | ||
| 1436 | struct cache *cache = ti->private; | ||
| 1437 | |||
| 1438 | destroy(cache); | ||
| 1439 | } | ||
| 1440 | |||
| 1441 | static sector_t get_dev_size(struct dm_dev *dev) | ||
| 1442 | { | ||
| 1443 | return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; | ||
| 1444 | } | ||
| 1445 | |||
| 1446 | /*----------------------------------------------------------------*/ | ||
| 1447 | |||
| 1448 | /* | ||
| 1449 | * Construct a cache device mapping. | ||
| 1450 | * | ||
| 1451 | * cache <metadata dev> <cache dev> <origin dev> <block size> | ||
| 1452 | * <#feature args> [<feature arg>]* | ||
| 1453 | * <policy> <#policy args> [<policy arg>]* | ||
| 1454 | * | ||
| 1455 | * metadata dev : fast device holding the persistent metadata | ||
| 1456 | * cache dev : fast device holding cached data blocks | ||
| 1457 | * origin dev : slow device holding original data blocks | ||
| 1458 | * block size : cache unit size in sectors | ||
| 1459 | * | ||
| 1460 | * #feature args : number of feature arguments passed | ||
| 1461 | * feature args : writethrough. (The default is writeback.) | ||
| 1462 | * | ||
| 1463 | * policy : the replacement policy to use | ||
| 1464 | * #policy args : an even number of policy arguments corresponding | ||
| 1465 | * to key/value pairs passed to the policy | ||
| 1466 | * policy args : key/value pairs passed to the policy | ||
| 1467 | * E.g. 'sequential_threshold 1024' | ||
| 1468 | * See cache-policies.txt for details. | ||
| 1469 | * | ||
| 1470 | * Optional feature arguments are: | ||
| 1471 | * writethrough : write through caching that prohibits cache block | ||
| 1472 | * content from being different from origin block content. | ||
| 1473 | * Without this argument, the default behaviour is to write | ||
| 1474 | * back cache block contents later for performance reasons, | ||
| 1475 | * so they may differ from the corresponding origin blocks. | ||
| 1476 | */ | ||
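
Putting the argument list together, a hypothetical invocation might look like the line below (device names, the target size, and the availability of the mq policy are illustrative assumptions, not taken from this patch):

    dmsetup create cached --table "0 41943040 cache /dev/mapper/fast-meta /dev/mapper/fast /dev/mapper/slow 512 1 writethrough mq 2 sequential_threshold 1024"

That is a 20 GiB target using 512-sector (256 KiB) cache blocks, one feature argument selecting writethrough, and the mq policy given a single sequential_threshold key/value pair.
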
| 1477 | struct cache_args { | ||
| 1478 | struct dm_target *ti; | ||
| 1479 | |||
| 1480 | struct dm_dev *metadata_dev; | ||
| 1481 | |||
| 1482 | struct dm_dev *cache_dev; | ||
| 1483 | sector_t cache_sectors; | ||
| 1484 | |||
| 1485 | struct dm_dev *origin_dev; | ||
| 1486 | sector_t origin_sectors; | ||
| 1487 | |||
| 1488 | uint32_t block_size; | ||
| 1489 | |||
| 1490 | const char *policy_name; | ||
| 1491 | int policy_argc; | ||
| 1492 | const char **policy_argv; | ||
| 1493 | |||
| 1494 | struct cache_features features; | ||
| 1495 | }; | ||
| 1496 | |||
| 1497 | static void destroy_cache_args(struct cache_args *ca) | ||
| 1498 | { | ||
| 1499 | if (ca->metadata_dev) | ||
| 1500 | dm_put_device(ca->ti, ca->metadata_dev); | ||
| 1501 | |||
| 1502 | if (ca->cache_dev) | ||
| 1503 | dm_put_device(ca->ti, ca->cache_dev); | ||
| 1504 | |||
| 1505 | if (ca->origin_dev) | ||
| 1506 | dm_put_device(ca->ti, ca->origin_dev); | ||
| 1507 | |||
| 1508 | kfree(ca); | ||
| 1509 | } | ||
| 1510 | |||
| 1511 | static bool at_least_one_arg(struct dm_arg_set *as, char **error) | ||
| 1512 | { | ||
| 1513 | if (!as->argc) { | ||
| 1514 | *error = "Insufficient args"; | ||
| 1515 | return false; | ||
| 1516 | } | ||
| 1517 | |||
| 1518 | return true; | ||
| 1519 | } | ||
| 1520 | |||
| 1521 | static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, | ||
| 1522 | char **error) | ||
| 1523 | { | ||
| 1524 | int r; | ||
| 1525 | sector_t metadata_dev_size; | ||
| 1526 | char b[BDEVNAME_SIZE]; | ||
| 1527 | |||
| 1528 | if (!at_least_one_arg(as, error)) | ||
| 1529 | return -EINVAL; | ||
| 1530 | |||
| 1531 | r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, | ||
| 1532 | &ca->metadata_dev); | ||
| 1533 | if (r) { | ||
| 1534 | *error = "Error opening metadata device"; | ||
| 1535 | return r; | ||
| 1536 | } | ||
| 1537 | |||
| 1538 | metadata_dev_size = get_dev_size(ca->metadata_dev); | ||
| 1539 | if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) | ||
| 1540 | DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", | ||
| 1541 | bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS_WARNING); | ||
| 1542 | |||
| 1543 | return 0; | ||
| 1544 | } | ||
| 1545 | |||
| 1546 | static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, | ||
| 1547 | char **error) | ||
| 1548 | { | ||
| 1549 | int r; | ||
| 1550 | |||
| 1551 | if (!at_least_one_arg(as, error)) | ||
| 1552 | return -EINVAL; | ||
| 1553 | |||
| 1554 | r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, | ||
| 1555 | &ca->cache_dev); | ||
| 1556 | if (r) { | ||
| 1557 | *error = "Error opening cache device"; | ||
| 1558 | return r; | ||
| 1559 | } | ||
| 1560 | ca->cache_sectors = get_dev_size(ca->cache_dev); | ||
| 1561 | |||
| 1562 | return 0; | ||
| 1563 | } | ||
| 1564 | |||
| 1565 | static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, | ||
| 1566 | char **error) | ||
| 1567 | { | ||
| 1568 | int r; | ||
| 1569 | |||
| 1570 | if (!at_least_one_arg(as, error)) | ||
| 1571 | return -EINVAL; | ||
| 1572 | |||
| 1573 | r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, | ||
| 1574 | &ca->origin_dev); | ||
| 1575 | if (r) { | ||
| 1576 | *error = "Error opening origin device"; | ||
| 1577 | return r; | ||
| 1578 | } | ||
| 1579 | |||
| 1580 | ca->origin_sectors = get_dev_size(ca->origin_dev); | ||
| 1581 | if (ca->ti->len > ca->origin_sectors) { | ||
| 1582 | *error = "Device size larger than cached device"; | ||
| 1583 | return -EINVAL; | ||
| 1584 | } | ||
| 1585 | |||
| 1586 | return 0; | ||
| 1587 | } | ||
| 1588 | |||
| 1589 | static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, | ||
| 1590 | char **error) | ||
| 1591 | { | ||
| 1592 | unsigned long tmp; | ||
| 1593 | |||
| 1594 | if (!at_least_one_arg(as, error)) | ||
| 1595 | return -EINVAL; | ||
| 1596 | |||
| 1597 | if (kstrtoul(dm_shift_arg(as), 10, &tmp) || !tmp || | ||
| 1598 | tmp < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || | ||
| 1599 | tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { | ||
| 1600 | *error = "Invalid data block size"; | ||
| 1601 | return -EINVAL; | ||
| 1602 | } | ||
| 1603 | |||
| 1604 | if (tmp > ca->cache_sectors) { | ||
| 1605 | *error = "Data block size is larger than the cache device"; | ||
| 1606 | return -EINVAL; | ||
| 1607 | } | ||
| 1608 | |||
| 1609 | ca->block_size = tmp; | ||
| 1610 | |||
| 1611 | return 0; | ||
| 1612 | } | ||
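
Because the minimum block size is a power of two, the `tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)` test is an exact multiple-of check. A sketch, assuming (purely for illustration) a 64-sector minimum:

    #include <assert.h>

    /* Assumed value of DATA_DEV_BLOCK_SIZE_MIN_SECTORS, for illustration only. */
    #define MIN_SECTORS 64u

    static int block_size_ok(unsigned long tmp)
    {
            /* The mask test works because MIN_SECTORS is a power of two. */
            return tmp && tmp >= MIN_SECTORS && !(tmp & (MIN_SECTORS - 1));
    }

    int main(void)
    {
            assert(block_size_ok(64));
            assert(block_size_ok(192));     /* multiples need not be powers of two */
            assert(!block_size_ok(96));     /* not a multiple of the minimum */
            assert(!block_size_ok(32));     /* below the minimum */
            return 0;
    }
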
| 1613 | |||
| 1614 | static void init_features(struct cache_features *cf) | ||
| 1615 | { | ||
| 1616 | cf->mode = CM_WRITE; | ||
| 1617 | cf->write_through = false; | ||
| 1618 | } | ||
| 1619 | |||
| 1620 | static int parse_features(struct cache_args *ca, struct dm_arg_set *as, | ||
| 1621 | char **error) | ||
| 1622 | { | ||
| 1623 | static struct dm_arg _args[] = { | ||
| 1624 | {0, 1, "Invalid number of cache feature arguments"}, | ||
| 1625 | }; | ||
| 1626 | |||
| 1627 | int r; | ||
| 1628 | unsigned argc; | ||
| 1629 | const char *arg; | ||
| 1630 | struct cache_features *cf = &ca->features; | ||
| 1631 | |||
| 1632 | init_features(cf); | ||
| 1633 | |||
| 1634 | r = dm_read_arg_group(_args, as, &argc, error); | ||
| 1635 | if (r) | ||
| 1636 | return -EINVAL; | ||
| 1637 | |||
| 1638 | while (argc--) { | ||
| 1639 | arg = dm_shift_arg(as); | ||
| 1640 | |||
| 1641 | if (!strcasecmp(arg, "writeback")) | ||
| 1642 | cf->write_through = false; | ||
| 1643 | |||
| 1644 | else if (!strcasecmp(arg, "writethrough")) | ||
| 1645 | cf->write_through = true; | ||
| 1646 | |||
| 1647 | else { | ||
| 1648 | *error = "Unrecognised cache feature requested"; | ||
| 1649 | return -EINVAL; | ||
| 1650 | } | ||
| 1651 | } | ||
| 1652 | |||
| 1653 | return 0; | ||
| 1654 | } | ||
| 1655 | |||
| 1656 | static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, | ||
| 1657 | char **error) | ||
| 1658 | { | ||
| 1659 | static struct dm_arg _args[] = { | ||
| 1660 | {0, 1024, "Invalid number of policy arguments"}, | ||
| 1661 | }; | ||
| 1662 | |||
| 1663 | int r; | ||
| 1664 | |||
| 1665 | if (!at_least_one_arg(as, error)) | ||
| 1666 | return -EINVAL; | ||
| 1667 | |||
| 1668 | ca->policy_name = dm_shift_arg(as); | ||
| 1669 | |||
| 1670 | r = dm_read_arg_group(_args, as, &ca->policy_argc, error); | ||
| 1671 | if (r) | ||
| 1672 | return -EINVAL; | ||
| 1673 | |||
| 1674 | ca->policy_argv = (const char **)as->argv; | ||
| 1675 | dm_consume_args(as, ca->policy_argc); | ||
| 1676 | |||
| 1677 | return 0; | ||
| 1678 | } | ||
| 1679 | |||
| 1680 | static int parse_cache_args(struct cache_args *ca, int argc, char **argv, | ||
| 1681 | char **error) | ||
| 1682 | { | ||
| 1683 | int r; | ||
| 1684 | struct dm_arg_set as; | ||
| 1685 | |||
| 1686 | as.argc = argc; | ||
| 1687 | as.argv = argv; | ||
| 1688 | |||
| 1689 | r = parse_metadata_dev(ca, &as, error); | ||
| 1690 | if (r) | ||
| 1691 | return r; | ||
| 1692 | |||
| 1693 | r = parse_cache_dev(ca, &as, error); | ||
| 1694 | if (r) | ||
| 1695 | return r; | ||
| 1696 | |||
| 1697 | r = parse_origin_dev(ca, &as, error); | ||
| 1698 | if (r) | ||
| 1699 | return r; | ||
| 1700 | |||
| 1701 | r = parse_block_size(ca, &as, error); | ||
| 1702 | if (r) | ||
| 1703 | return r; | ||
| 1704 | |||
| 1705 | r = parse_features(ca, &as, error); | ||
| 1706 | if (r) | ||
| 1707 | return r; | ||
| 1708 | |||
| 1709 | r = parse_policy(ca, &as, error); | ||
| 1710 | if (r) | ||
| 1711 | return r; | ||
| 1712 | |||
| 1713 | return 0; | ||
| 1714 | } | ||
| 1715 | |||
| 1716 | /*----------------------------------------------------------------*/ | ||
| 1717 | |||
| 1718 | static struct kmem_cache *migration_cache; | ||
| 1719 | |||
| 1720 | static int set_config_values(struct dm_cache_policy *p, int argc, const char **argv) | ||
| 1721 | { | ||
| 1722 | int r = 0; | ||
| 1723 | |||
| 1724 | if (argc & 1) { | ||
| 1725 | DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); | ||
| 1726 | return -EINVAL; | ||
| 1727 | } | ||
| 1728 | |||
| 1729 | while (argc) { | ||
| 1730 | r = policy_set_config_value(p, argv[0], argv[1]); | ||
| 1731 | if (r) { | ||
| 1732 | DMWARN("policy_set_config_value failed: key = '%s', value = '%s'", | ||
| 1733 | argv[0], argv[1]); | ||
| 1734 | return r; | ||
| 1735 | } | ||
| 1736 | |||
| 1737 | argc -= 2; | ||
| 1738 | argv += 2; | ||
| 1739 | } | ||
| 1740 | |||
| 1741 | return r; | ||
| 1742 | } | ||
| 1743 | |||
| 1744 | static int create_cache_policy(struct cache *cache, struct cache_args *ca, | ||
| 1745 | char **error) | ||
| 1746 | { | ||
| 1747 | int r; | ||
| 1748 | |||
| 1749 | cache->policy = dm_cache_policy_create(ca->policy_name, | ||
| 1750 | cache->cache_size, | ||
| 1751 | cache->origin_sectors, | ||
| 1752 | cache->sectors_per_block); | ||
| 1753 | if (!cache->policy) { | ||
| 1754 | *error = "Error creating cache's policy"; | ||
| 1755 | return -ENOMEM; | ||
| 1756 | } | ||
| 1757 | |||
| 1758 | r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv); | ||
| 1759 | if (r) | ||
| 1760 | dm_cache_policy_destroy(cache->policy); | ||
| 1761 | |||
| 1762 | return r; | ||
| 1763 | } | ||
| 1764 | |||
| 1765 | /* | ||
| 1766 | * We want the discard block size to be a power of two, at least as large | ||
| 1767 | * as the cache block size, and to allow no more than 2^14 discard blocks | ||
| 1768 | * across the origin. | ||
| 1769 | */ | ||
| 1770 | #define MAX_DISCARD_BLOCKS (1 << 14) | ||
| 1771 | |||
| 1772 | static bool too_many_discard_blocks(sector_t discard_block_size, | ||
| 1773 | sector_t origin_size) | ||
| 1774 | { | ||
| 1775 | (void) sector_div(origin_size, discard_block_size); | ||
| 1776 | |||
| 1777 | return origin_size > MAX_DISCARD_BLOCKS; | ||
| 1778 | } | ||
| 1779 | |||
| 1780 | static sector_t calculate_discard_block_size(sector_t cache_block_size, | ||
| 1781 | sector_t origin_size) | ||
| 1782 | { | ||
| 1783 | sector_t discard_block_size; | ||
| 1784 | |||
| 1785 | discard_block_size = roundup_pow_of_two(cache_block_size); | ||
| 1786 | |||
| 1787 | if (origin_size) | ||
| 1788 | while (too_many_discard_blocks(discard_block_size, origin_size)) | ||
| 1789 | discard_block_size *= 2; | ||
| 1790 | |||
| 1791 | return discard_block_size; | ||
| 1792 | } | ||
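
Worked through with assumed numbers, a 384-sector cache block on a 1 TiB origin: rounding up gives a 512-sector discard block, which then doubles until the origin holds at most 2^14 discard blocks:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_DISCARD_BLOCKS (1 << 14)

    /* Userspace stand-in for the kernel's roundup_pow_of_two(). */
    static uint64_t roundup_pow_of_two(uint64_t n)
    {
            uint64_t r = 1;

            while (r < n)
                    r <<= 1;
            return r;
    }

    int main(void)
    {
            uint64_t cache_block_size = 384;        /* sectors, hypothetical */
            uint64_t origin_size = 1ull << 31;      /* 1 TiB in 512-byte sectors */
            uint64_t dbs = roundup_pow_of_two(cache_block_size);    /* 512 */

            while (origin_size / dbs > MAX_DISCARD_BLOCKS)
                    dbs *= 2;

            printf("discard block size: %llu sectors\n",
                   (unsigned long long)dbs);        /* 131072 */
            return 0;
    }

131072 sectors is 64 MiB, i.e. this origin ends up with exactly 2^14 discard blocks.
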
| 1793 | |||
| 1794 | #define DEFAULT_MIGRATION_THRESHOLD (2048 * 100) | ||
| 1795 | |||
| 1796 | static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio); | ||
| 1797 | |||
| 1798 | static int cache_create(struct cache_args *ca, struct cache **result) | ||
| 1799 | { | ||
| 1800 | int r = 0; | ||
| 1801 | char **error = &ca->ti->error; | ||
| 1802 | struct cache *cache; | ||
| 1803 | struct dm_target *ti = ca->ti; | ||
| 1804 | dm_block_t origin_blocks; | ||
| 1805 | struct dm_cache_metadata *cmd; | ||
| 1806 | bool may_format = ca->features.mode == CM_WRITE; | ||
| 1807 | |||
| 1808 | cache = kzalloc(sizeof(*cache), GFP_KERNEL); | ||
| 1809 | if (!cache) | ||
| 1810 | return -ENOMEM; | ||
| 1811 | |||
| 1812 | cache->ti = ca->ti; | ||
| 1813 | ti->private = cache; | ||
| 1814 | ti->per_bio_data_size = sizeof(struct per_bio_data); | ||
| 1815 | ti->num_flush_bios = 2; | ||
| 1816 | ti->flush_supported = true; | ||
| 1817 | |||
| 1818 | ti->num_discard_bios = 1; | ||
| 1819 | ti->discards_supported = true; | ||
| 1820 | ti->discard_zeroes_data_unsupported = true; | ||
| 1821 | |||
| 1822 | memcpy(&cache->features, &ca->features, sizeof(cache->features)); | ||
| 1823 | |||
| 1824 | if (cache->features.write_through) | ||
| 1825 | ti->num_write_bios = cache_num_write_bios; | ||
| 1826 | |||
| 1827 | cache->callbacks.congested_fn = cache_is_congested; | ||
| 1828 | dm_table_add_target_callbacks(ti->table, &cache->callbacks); | ||
| 1829 | |||
| 1830 | cache->metadata_dev = ca->metadata_dev; | ||
| 1831 | cache->origin_dev = ca->origin_dev; | ||
| 1832 | cache->cache_dev = ca->cache_dev; | ||
| 1833 | |||
| 1834 | ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; | ||
| 1835 | |||
| 1836 | /* FIXME: factor out this whole section */ | ||
| 1837 | origin_blocks = cache->origin_sectors = ca->origin_sectors; | ||
| 1838 | (void) sector_div(origin_blocks, ca->block_size); | ||
| 1839 | cache->origin_blocks = to_oblock(origin_blocks); | ||
| 1840 | |||
| 1841 | cache->sectors_per_block = ca->block_size; | ||
| 1842 | if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { | ||
| 1843 | r = -EINVAL; | ||
| 1844 | goto bad; | ||
| 1845 | } | ||
| 1846 | |||
| 1847 | if (ca->block_size & (ca->block_size - 1)) { | ||
| 1848 | dm_block_t cache_size = ca->cache_sectors; | ||
| 1849 | |||
| 1850 | cache->sectors_per_block_shift = -1; | ||
| 1851 | (void) sector_div(cache_size, ca->block_size); | ||
| 1852 | cache->cache_size = to_cblock(cache_size); | ||
| 1853 | } else { | ||
| 1854 | cache->sectors_per_block_shift = __ffs(ca->block_size); | ||
| 1855 | cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift); | ||
| 1856 | } | ||
| 1857 | |||
| 1858 | r = create_cache_policy(cache, ca, error); | ||
| 1859 | if (r) | ||
| 1860 | goto bad; | ||
| 1861 | cache->policy_nr_args = ca->policy_argc; | ||
| 1862 | |||
| 1863 | cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, | ||
| 1864 | ca->block_size, may_format, | ||
| 1865 | dm_cache_policy_get_hint_size(cache->policy)); | ||
| 1866 | if (IS_ERR(cmd)) { | ||
| 1867 | *error = "Error creating metadata object"; | ||
| 1868 | r = PTR_ERR(cmd); | ||
| 1869 | goto bad; | ||
| 1870 | } | ||
| 1871 | cache->cmd = cmd; | ||
| 1872 | |||
| 1873 | spin_lock_init(&cache->lock); | ||
| 1874 | bio_list_init(&cache->deferred_bios); | ||
| 1875 | bio_list_init(&cache->deferred_flush_bios); | ||
| 1876 | INIT_LIST_HEAD(&cache->quiesced_migrations); | ||
| 1877 | INIT_LIST_HEAD(&cache->completed_migrations); | ||
| 1878 | INIT_LIST_HEAD(&cache->need_commit_migrations); | ||
| 1879 | cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; | ||
| 1880 | atomic_set(&cache->nr_migrations, 0); | ||
| 1881 | init_waitqueue_head(&cache->migration_wait); | ||
| 1882 | |||
| 1883 | cache->nr_dirty = 0; | ||
| 1884 | cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); | ||
| 1885 | if (!cache->dirty_bitset) { | ||
| 1886 | *error = "could not allocate dirty bitset"; | ||
| | r = -ENOMEM; | ||
| 1887 | goto bad; | ||
| 1888 | } | ||
| 1889 | clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); | ||
| 1890 | |||
| 1891 | cache->discard_block_size = | ||
| 1892 | calculate_discard_block_size(cache->sectors_per_block, | ||
| 1893 | cache->origin_sectors); | ||
| 1894 | cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks); | ||
| 1895 | cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); | ||
| 1896 | if (!cache->discard_bitset) { | ||
| 1897 | *error = "could not allocate discard bitset"; | ||
| | r = -ENOMEM; | ||
| 1898 | goto bad; | ||
| 1899 | } | ||
| 1900 | clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); | ||
| 1901 | |||
| 1902 | cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); | ||
| 1903 | if (IS_ERR(cache->copier)) { | ||
| 1904 | *error = "could not create kcopyd client"; | ||
| 1905 | r = PTR_ERR(cache->copier); | ||
| 1906 | goto bad; | ||
| 1907 | } | ||
| 1908 | |||
| 1909 | cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); | ||
| 1910 | if (!cache->wq) { | ||
| 1911 | *error = "could not create workqueue for metadata object"; | ||
| | r = -ENOMEM; | ||
| 1912 | goto bad; | ||
| 1913 | } | ||
| 1914 | INIT_WORK(&cache->worker, do_worker); | ||
| 1915 | INIT_DELAYED_WORK(&cache->waker, do_waker); | ||
| 1916 | cache->last_commit_jiffies = jiffies; | ||
| 1917 | |||
| 1918 | cache->prison = dm_bio_prison_create(PRISON_CELLS); | ||
| 1919 | if (!cache->prison) { | ||
| 1920 | *error = "could not create bio prison"; | ||
| | r = -ENOMEM; | ||
| 1921 | goto bad; | ||
| 1922 | } | ||
| 1923 | |||
| 1924 | cache->all_io_ds = dm_deferred_set_create(); | ||
| 1925 | if (!cache->all_io_ds) { | ||
| 1926 | *error = "could not create all_io deferred set"; | ||
| | r = -ENOMEM; | ||
| 1927 | goto bad; | ||
| 1928 | } | ||
| 1929 | |||
| 1930 | cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, | ||
| 1931 | migration_cache); | ||
| 1932 | if (!cache->migration_pool) { | ||
| 1933 | *error = "Error creating cache's migration mempool"; | ||
| | r = -ENOMEM; | ||
| 1934 | goto bad; | ||
| 1935 | } | ||
| 1936 | |||
| 1937 | cache->next_migration = NULL; | ||
| 1938 | |||
| 1939 | cache->need_tick_bio = true; | ||
| 1940 | cache->sized = false; | ||
| 1941 | cache->quiescing = false; | ||
| 1942 | cache->commit_requested = false; | ||
| 1943 | cache->loaded_mappings = false; | ||
| 1944 | cache->loaded_discards = false; | ||
| 1945 | |||
| 1946 | load_stats(cache); | ||
| 1947 | |||
| 1948 | atomic_set(&cache->stats.demotion, 0); | ||
| 1949 | atomic_set(&cache->stats.promotion, 0); | ||
| 1950 | atomic_set(&cache->stats.copies_avoided, 0); | ||
| 1951 | atomic_set(&cache->stats.cache_cell_clash, 0); | ||
| 1952 | atomic_set(&cache->stats.commit_count, 0); | ||
| 1953 | atomic_set(&cache->stats.discard_count, 0); | ||
| 1954 | |||
| 1955 | *result = cache; | ||
| 1956 | return 0; | ||
| 1957 | |||
| 1958 | bad: | ||
| 1959 | destroy(cache); | ||
| 1960 | return r; | ||
| 1961 | } | ||
| 1962 | |||
| 1963 | static int copy_ctr_args(struct cache *cache, int argc, const char **argv) | ||
| 1964 | { | ||
| 1965 | unsigned i; | ||
| 1966 | const char **copy; | ||
| 1967 | |||
| 1968 | copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); | ||
| 1969 | if (!copy) | ||
| 1970 | return -ENOMEM; | ||
| 1971 | for (i = 0; i < argc; i++) { | ||
| 1972 | copy[i] = kstrdup(argv[i], GFP_KERNEL); | ||
| 1973 | if (!copy[i]) { | ||
| 1974 | while (i--) | ||
| 1975 | kfree(copy[i]); | ||
| 1976 | kfree(copy); | ||
| 1977 | return -ENOMEM; | ||
| 1978 | } | ||
| 1979 | } | ||
| 1980 | |||
| 1981 | cache->nr_ctr_args = argc; | ||
| 1982 | cache->ctr_args = copy; | ||
| 1983 | |||
| 1984 | return 0; | ||
| 1985 | } | ||
| 1986 | |||
| 1987 | static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) | ||
| 1988 | { | ||
| 1989 | int r = -EINVAL; | ||
| 1990 | struct cache_args *ca; | ||
| 1991 | struct cache *cache = NULL; | ||
| 1992 | |||
| 1993 | ca = kzalloc(sizeof(*ca), GFP_KERNEL); | ||
| 1994 | if (!ca) { | ||
| 1995 | ti->error = "Error allocating memory for cache"; | ||
| 1996 | return -ENOMEM; | ||
| 1997 | } | ||
| 1998 | ca->ti = ti; | ||
| 1999 | |||
| 2000 | r = parse_cache_args(ca, argc, argv, &ti->error); | ||
| 2001 | if (r) | ||
| 2002 | goto out; | ||
| 2003 | |||
| 2004 | r = cache_create(ca, &cache); | ||
| | if (r) | ||
| | goto out; | ||
| 2005 | |||
| 2006 | r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); | ||
| 2007 | if (r) { | ||
| 2008 | destroy(cache); | ||
| 2009 | goto out; | ||
| 2010 | } | ||
| 2011 | |||
| 2012 | ti->private = cache; | ||
| 2013 | |||
| 2014 | out: | ||
| 2015 | destroy_cache_args(ca); | ||
| 2016 | return r; | ||
| 2017 | } | ||
| 2018 | |||
| 2019 | static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio) | ||
| 2020 | { | ||
| 2021 | int r; | ||
| 2022 | struct cache *cache = ti->private; | ||
| 2023 | dm_oblock_t block = get_bio_block(cache, bio); | ||
| 2024 | dm_cblock_t cblock; | ||
| 2025 | |||
| 2026 | r = policy_lookup(cache->policy, block, &cblock); | ||
| 2027 | if (r < 0) | ||
| 2028 | return 2; /* assume the worst */ | ||
| 2029 | |||
| 2030 | return (!r && !is_dirty(cache, cblock)) ? 2 : 1; | ||
| 2031 | } | ||
| 2032 | |||
| 2033 | static int cache_map(struct dm_target *ti, struct bio *bio) | ||
| 2034 | { | ||
| 2035 | struct cache *cache = ti->private; | ||
| 2036 | |||
| 2037 | int r; | ||
| 2038 | dm_oblock_t block = get_bio_block(cache, bio); | ||
| 2039 | bool can_migrate = false; | ||
| 2040 | bool discarded_block; | ||
| 2041 | struct dm_bio_prison_cell *cell; | ||
| 2042 | struct policy_result lookup_result; | ||
| 2043 | struct per_bio_data *pb; | ||
| 2044 | |||
| 2045 | if (from_oblock(block) > from_oblock(cache->origin_blocks)) { | ||
| 2046 | /* | ||
| 2047 | * This can only occur if the io goes to a partial block at | ||
| 2048 | * the end of the origin device. We don't cache these. | ||
| 2049 | * Just remap to the origin and carry on. | ||
| 2050 | */ | ||
| 2051 | remap_to_origin_clear_discard(cache, bio, block); | ||
| 2052 | return DM_MAPIO_REMAPPED; | ||
| 2053 | } | ||
| 2054 | |||
| 2055 | pb = init_per_bio_data(bio); | ||
| 2056 | |||
| 2057 | if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) { | ||
| 2058 | defer_bio(cache, bio); | ||
| 2059 | return DM_MAPIO_SUBMITTED; | ||
| 2060 | } | ||
| 2061 | |||
| 2062 | /* | ||
| 2063 | * Check to see if that block is currently migrating. | ||
| 2064 | */ | ||
| 2065 | cell = alloc_prison_cell(cache); | ||
| 2066 | if (!cell) { | ||
| 2067 | defer_bio(cache, bio); | ||
| 2068 | return DM_MAPIO_SUBMITTED; | ||
| 2069 | } | ||
| 2070 | |||
| 2071 | r = bio_detain(cache, block, bio, cell, | ||
| 2072 | (cell_free_fn) free_prison_cell, | ||
| 2073 | cache, &cell); | ||
| 2074 | if (r) { | ||
| 2075 | if (r < 0) | ||
| 2076 | defer_bio(cache, bio); | ||
| 2077 | |||
| 2078 | return DM_MAPIO_SUBMITTED; | ||
| 2079 | } | ||
| 2080 | |||
| 2081 | discarded_block = is_discarded_oblock(cache, block); | ||
| 2082 | |||
| 2083 | r = policy_map(cache->policy, block, false, can_migrate, discarded_block, | ||
| 2084 | bio, &lookup_result); | ||
| 2085 | if (r == -EWOULDBLOCK) { | ||
| 2086 | cell_defer(cache, cell, true); | ||
| 2087 | return DM_MAPIO_SUBMITTED; | ||
| 2088 | |||
| 2089 | } else if (r) { | ||
| 2090 | DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r); | ||
| 2091 | bio_io_error(bio); | ||
| 2092 | return DM_MAPIO_SUBMITTED; | ||
| 2093 | } | ||
| 2094 | |||
| 2095 | switch (lookup_result.op) { | ||
| 2096 | case POLICY_HIT: | ||
| 2097 | inc_hit_counter(cache, bio); | ||
| 2098 | pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); | ||
| 2099 | |||
| 2100 | if (is_writethrough_io(cache, bio, lookup_result.cblock)) { | ||
| 2101 | /* | ||
| 2102 | * No need to mark anything dirty in write through mode. | ||
| 2103 | */ | ||
| 2104 | pb->req_nr == 0 ? | ||
| 2105 | remap_to_cache(cache, bio, lookup_result.cblock) : | ||
| 2106 | remap_to_origin_clear_discard(cache, bio, block); | ||
| 2107 | cell_defer(cache, cell, false); | ||
| 2108 | } else { | ||
| 2109 | remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); | ||
| 2110 | cell_defer(cache, cell, false); | ||
| 2111 | } | ||
| 2112 | break; | ||
| 2113 | |||
| 2114 | case POLICY_MISS: | ||
| 2115 | inc_miss_counter(cache, bio); | ||
| 2116 | pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); | ||
| 2117 | |||
| 2118 | if (pb->req_nr != 0) { | ||
| 2119 | /* | ||
| 2120 | * This is a duplicate writethrough io that is no | ||
| 2121 | * longer needed because the block has been demoted. | ||
| 2122 | */ | ||
| 2123 | bio_endio(bio, 0); | ||
| 2124 | cell_defer(cache, cell, false); | ||
| 2125 | return DM_MAPIO_SUBMITTED; | ||
| 2126 | } else { | ||
| 2127 | remap_to_origin_clear_discard(cache, bio, block); | ||
| 2128 | cell_defer(cache, cell, false); | ||
| 2129 | } | ||
| 2130 | break; | ||
| 2131 | |||
| 2132 | default: | ||
| 2133 | DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__, | ||
| 2134 | (unsigned) lookup_result.op); | ||
| 2135 | bio_io_error(bio); | ||
| 2136 | return DM_MAPIO_SUBMITTED; | ||
| 2137 | } | ||
| 2138 | |||
| 2139 | return DM_MAPIO_REMAPPED; | ||
| 2140 | } | ||
| 2141 | |||
| 2142 | static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) | ||
| 2143 | { | ||
| 2144 | struct cache *cache = ti->private; | ||
| 2145 | unsigned long flags; | ||
| 2146 | struct per_bio_data *pb = get_per_bio_data(bio); | ||
| 2147 | |||
| 2148 | if (pb->tick) { | ||
| 2149 | policy_tick(cache->policy); | ||
| 2150 | |||
| 2151 | spin_lock_irqsave(&cache->lock, flags); | ||
| 2152 | cache->need_tick_bio = true; | ||
| 2153 | spin_unlock_irqrestore(&cache->lock, flags); | ||
| 2154 | } | ||
| 2155 | |||
| 2156 | check_for_quiesced_migrations(cache, pb); | ||
| 2157 | |||
| 2158 | return 0; | ||
| 2159 | } | ||
| 2160 | |||
| 2161 | static int write_dirty_bitset(struct cache *cache) | ||
| 2162 | { | ||
| 2163 | unsigned i, r; | ||
| 2164 | |||
| 2165 | for (i = 0; i < from_cblock(cache->cache_size); i++) { | ||
| 2166 | r = dm_cache_set_dirty(cache->cmd, to_cblock(i), | ||
| 2167 | is_dirty(cache, to_cblock(i))); | ||
| 2168 | if (r) | ||
| 2169 | return r; | ||
| 2170 | } | ||
| 2171 | |||
| 2172 | return 0; | ||
| 2173 | } | ||
| 2174 | |||
| 2175 | static int write_discard_bitset(struct cache *cache) | ||
| 2176 | { | ||
| 2177 | unsigned i, r; | ||
| 2178 | |||
| 2179 | r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, | ||
| 2180 | cache->discard_nr_blocks); | ||
| 2181 | if (r) { | ||
| 2182 | DMERR("could not resize on-disk discard bitset"); | ||
| 2183 | return r; | ||
| 2184 | } | ||
| 2185 | |||
| 2186 | for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { | ||
| 2187 | r = dm_cache_set_discard(cache->cmd, to_dblock(i), | ||
| 2188 | is_discarded(cache, to_dblock(i))); | ||
| 2189 | if (r) | ||
| 2190 | return r; | ||
| 2191 | } | ||
| 2192 | |||
| 2193 | return 0; | ||
| 2194 | } | ||
| 2195 | |||
| 2196 | static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock, | ||
| 2197 | uint32_t hint) | ||
| 2198 | { | ||
| 2199 | struct cache *cache = context; | ||
| 2200 | return dm_cache_save_hint(cache->cmd, cblock, hint); | ||
| 2201 | } | ||
| 2202 | |||
| 2203 | static int write_hints(struct cache *cache) | ||
| 2204 | { | ||
| 2205 | int r; | ||
| 2206 | |||
| 2207 | r = dm_cache_begin_hints(cache->cmd, cache->policy); | ||
| 2208 | if (r) { | ||
| 2209 | DMERR("dm_cache_begin_hints failed"); | ||
| 2210 | return r; | ||
| 2211 | } | ||
| 2212 | |||
| 2213 | r = policy_walk_mappings(cache->policy, save_hint, cache); | ||
| 2214 | if (r) | ||
| 2215 | DMERR("policy_walk_mappings failed"); | ||
| 2216 | |||
| 2217 | return r; | ||
| 2218 | } | ||
| 2219 | |||
| 2220 | /* | ||
| 2221 | * returns true on success | ||
| 2222 | */ | ||
| 2223 | static bool sync_metadata(struct cache *cache) | ||
| 2224 | { | ||
| 2225 | int r1, r2, r3, r4; | ||
| 2226 | |||
| 2227 | r1 = write_dirty_bitset(cache); | ||
| 2228 | if (r1) | ||
| 2229 | DMERR("could not write dirty bitset"); | ||
| 2230 | |||
| 2231 | r2 = write_discard_bitset(cache); | ||
| 2232 | if (r2) | ||
| 2233 | DMERR("could not write discard bitset"); | ||
| 2234 | |||
| 2235 | save_stats(cache); | ||
| 2236 | |||
| 2237 | r3 = write_hints(cache); | ||
| 2238 | if (r3) | ||
| 2239 | DMERR("could not write hints"); | ||
| 2240 | |||
| 2241 | /* | ||
| 2242 | * If writing the above metadata failed, we still commit, but don't | ||
| 2243 | * set the clean shutdown flag. This will effectively force every | ||
| 2244 | * dirty bit to be set on reload. | ||
| 2245 | */ | ||
| 2246 | r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3); | ||
| 2247 | if (r4) | ||
| 2248 | DMERR("could not write cache metadata. Data loss may occur."); | ||
| 2249 | |||
| 2250 | return !r1 && !r2 && !r3 && !r4; | ||
| 2251 | } | ||
| 2252 | |||
| 2253 | static void cache_postsuspend(struct dm_target *ti) | ||
| 2254 | { | ||
| 2255 | struct cache *cache = ti->private; | ||
| 2256 | |||
| 2257 | start_quiescing(cache); | ||
| 2258 | wait_for_migrations(cache); | ||
| 2259 | stop_worker(cache); | ||
| 2260 | requeue_deferred_io(cache); | ||
| 2261 | stop_quiescing(cache); | ||
| 2262 | |||
| 2263 | (void) sync_metadata(cache); | ||
| 2264 | } | ||
| 2265 | |||
| 2266 | static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, | ||
| 2267 | bool dirty, uint32_t hint, bool hint_valid) | ||
| 2268 | { | ||
| 2269 | int r; | ||
| 2270 | struct cache *cache = context; | ||
| 2271 | |||
| 2272 | r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); | ||
| 2273 | if (r) | ||
| 2274 | return r; | ||
| 2275 | |||
| 2276 | if (dirty) | ||
| 2277 | set_dirty(cache, oblock, cblock); | ||
| 2278 | else | ||
| 2279 | clear_dirty(cache, oblock, cblock); | ||
| 2280 | |||
| 2281 | return 0; | ||
| 2282 | } | ||
| 2283 | |||
| 2284 | static int load_discard(void *context, sector_t discard_block_size, | ||
| 2285 | dm_dblock_t dblock, bool discard) | ||
| 2286 | { | ||
| 2287 | struct cache *cache = context; | ||
| 2288 | |||
| 2289 | /* FIXME: handle mis-matched block size */ | ||
| 2290 | |||
| 2291 | if (discard) | ||
| 2292 | set_discard(cache, dblock); | ||
| 2293 | else | ||
| 2294 | clear_discard(cache, dblock); | ||
| 2295 | |||
| 2296 | return 0; | ||
| 2297 | } | ||
| 2298 | |||
| 2299 | static int cache_preresume(struct dm_target *ti) | ||
| 2300 | { | ||
| 2301 | int r = 0; | ||
| 2302 | struct cache *cache = ti->private; | ||
| 2303 | sector_t actual_cache_size = get_dev_size(cache->cache_dev); | ||
| 2304 | (void) sector_div(actual_cache_size, cache->sectors_per_block); | ||
| 2305 | |||
| 2306 | /* | ||
| 2307 | * Check to see if the cache has resized. | ||
| 2308 | */ | ||
| 2309 | if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) { | ||
| 2310 | cache->cache_size = to_cblock(actual_cache_size); | ||
| 2311 | |||
| 2312 | r = dm_cache_resize(cache->cmd, cache->cache_size); | ||
| 2313 | if (r) { | ||
| 2314 | DMERR("could not resize cache metadata"); | ||
| 2315 | return r; | ||
| 2316 | } | ||
| 2317 | |||
| 2318 | cache->sized = true; | ||
| 2319 | } | ||
| 2320 | |||
| 2321 | if (!cache->loaded_mappings) { | ||
| 2322 | r = dm_cache_load_mappings(cache->cmd, | ||
| 2323 | dm_cache_policy_get_name(cache->policy), | ||
| 2324 | load_mapping, cache); | ||
| 2325 | if (r) { | ||
| 2326 | DMERR("could not load cache mappings"); | ||
| 2327 | return r; | ||
| 2328 | } | ||
| 2329 | |||
| 2330 | cache->loaded_mappings = true; | ||
| 2331 | } | ||
| 2332 | |||
| 2333 | if (!cache->loaded_discards) { | ||
| 2334 | r = dm_cache_load_discards(cache->cmd, load_discard, cache); | ||
| 2335 | if (r) { | ||
| 2336 | DMERR("could not load origin discards"); | ||
| 2337 | return r; | ||
| 2338 | } | ||
| 2339 | |||
| 2340 | cache->loaded_discards = true; | ||
| 2341 | } | ||
| 2342 | |||
| 2343 | return r; | ||
| 2344 | } | ||
| 2345 | |||
| 2346 | static void cache_resume(struct dm_target *ti) | ||
| 2347 | { | ||
| 2348 | struct cache *cache = ti->private; | ||
| 2349 | |||
| 2350 | cache->need_tick_bio = true; | ||
| 2351 | do_waker(&cache->waker.work); | ||
| 2352 | } | ||
| 2353 | |||
| 2354 | /* | ||
| 2355 | * Status format: | ||
| 2356 | * | ||
| 2357 | * <#used metadata blocks>/<#total metadata blocks> | ||
| 2358 | * <#read hits> <#read misses> <#write hits> <#write misses> | ||
| 2359 | * <#demotions> <#promotions> <#blocks in cache> <#dirty> | ||
| 2360 | * <#features> <features>* | ||
| 2361 | * <#core args> <core args> | ||
| 2362 | * <#policy args> <policy args>* | ||
| 2363 | */ | ||
| 2364 | static void cache_status(struct dm_target *ti, status_type_t type, | ||
| 2365 | unsigned status_flags, char *result, unsigned maxlen) | ||
| 2366 | { | ||
| 2367 | int r = 0; | ||
| 2368 | unsigned i; | ||
| 2369 | ssize_t sz = 0; | ||
| 2370 | dm_block_t nr_free_blocks_metadata = 0; | ||
| 2371 | dm_block_t nr_blocks_metadata = 0; | ||
| 2372 | char buf[BDEVNAME_SIZE]; | ||
| 2373 | struct cache *cache = ti->private; | ||
| 2374 | dm_cblock_t residency; | ||
| 2375 | |||
| 2376 | switch (type) { | ||
| 2377 | case STATUSTYPE_INFO: | ||
| 2378 | /* Commit to ensure statistics aren't out-of-date */ | ||
| 2379 | if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) { | ||
| 2380 | r = dm_cache_commit(cache->cmd, false); | ||
| 2381 | if (r) | ||
| 2382 | DMERR("could not commit metadata for accurate status"); | ||
| 2383 | } | ||
| 2384 | |||
| 2385 | r = dm_cache_get_free_metadata_block_count(cache->cmd, | ||
| 2386 | &nr_free_blocks_metadata); | ||
| 2387 | if (r) { | ||
| 2388 | DMERR("could not get metadata free block count"); | ||
| 2389 | goto err; | ||
| 2390 | } | ||
| 2391 | |||
| 2392 | r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); | ||
| 2393 | if (r) { | ||
| 2394 | DMERR("could not get metadata device size"); | ||
| 2395 | goto err; | ||
| 2396 | } | ||
| 2397 | |||
| 2398 | residency = policy_residency(cache->policy); | ||
| 2399 | |||
| 2400 | DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ", | ||
| 2401 | (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), | ||
| 2402 | (unsigned long long)nr_blocks_metadata, | ||
| 2403 | (unsigned) atomic_read(&cache->stats.read_hit), | ||
| 2404 | (unsigned) atomic_read(&cache->stats.read_miss), | ||
| 2405 | (unsigned) atomic_read(&cache->stats.write_hit), | ||
| 2406 | (unsigned) atomic_read(&cache->stats.write_miss), | ||
| 2407 | (unsigned) atomic_read(&cache->stats.demotion), | ||
| 2408 | (unsigned) atomic_read(&cache->stats.promotion), | ||
| 2409 | (unsigned long long) from_cblock(residency), | ||
| 2410 | cache->nr_dirty); | ||
| 2411 | |||
| 2412 | if (cache->features.write_through) | ||
| 2413 | DMEMIT("1 writethrough "); | ||
| 2414 | else | ||
| 2415 | DMEMIT("0 "); | ||
| 2416 | |||
| 2417 | DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); | ||
| 2418 | if (sz < maxlen) { | ||
| 2419 | r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz); | ||
| 2420 | if (r) | ||
| 2421 | DMERR("policy_emit_config_values returned %d", r); | ||
| 2422 | } | ||
| 2423 | |||
| 2424 | break; | ||
| 2425 | |||
| 2426 | case STATUSTYPE_TABLE: | ||
| 2427 | format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); | ||
| 2428 | DMEMIT("%s ", buf); | ||
| 2429 | format_dev_t(buf, cache->cache_dev->bdev->bd_dev); | ||
| 2430 | DMEMIT("%s ", buf); | ||
| 2431 | format_dev_t(buf, cache->origin_dev->bdev->bd_dev); | ||
| 2432 | DMEMIT("%s", buf); | ||
| 2433 | |||
| 2434 | for (i = 0; i < cache->nr_ctr_args - 1; i++) | ||
| 2435 | DMEMIT(" %s", cache->ctr_args[i]); | ||
| 2436 | if (cache->nr_ctr_args) | ||
| 2437 | DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); | ||
| 2438 | } | ||
| 2439 | |||
| 2440 | return; | ||
| 2441 | |||
| 2442 | err: | ||
| 2443 | DMEMIT("Error"); | ||
| 2444 | } | ||
| 2445 | |||
| 2446 | #define NOT_CORE_OPTION 1 | ||
| 2447 | |||
| 2448 | static int process_config_option(struct cache *cache, char **argv) | ||
| 2449 | { | ||
| 2450 | unsigned long tmp; | ||
| 2451 | |||
| 2452 | if (!strcasecmp(argv[0], "migration_threshold")) { | ||
| 2453 | if (kstrtoul(argv[1], 10, &tmp)) | ||
| 2454 | return -EINVAL; | ||
| 2455 | |||
| 2456 | cache->migration_threshold = tmp; | ||
| 2457 | return 0; | ||
| 2458 | } | ||
| 2459 | |||
| 2460 | return NOT_CORE_OPTION; | ||
| 2461 | } | ||
| 2462 | |||
| 2463 | /* | ||
| 2464 | * Supports <key> <value>. | ||
| 2465 | * | ||
| 2466 | * The key migration_threshold is supported by the cache target core. | ||
| 2467 | */ | ||
| 2468 | static int cache_message(struct dm_target *ti, unsigned argc, char **argv) | ||
| 2469 | { | ||
| 2470 | int r; | ||
| 2471 | struct cache *cache = ti->private; | ||
| 2472 | |||
| 2473 | if (argc != 2) | ||
| 2474 | return -EINVAL; | ||
| 2475 | |||
| 2476 | r = process_config_option(cache, argv); | ||
| 2477 | if (r == NOT_CORE_OPTION) | ||
| 2478 | return policy_set_config_value(cache->policy, argv[0], argv[1]); | ||
| 2479 | |||
| 2480 | return r; | ||
| 2481 | } | ||
| 2482 | |||
| 2483 | static int cache_iterate_devices(struct dm_target *ti, | ||
| 2484 | iterate_devices_callout_fn fn, void *data) | ||
| 2485 | { | ||
| 2486 | int r = 0; | ||
| 2487 | struct cache *cache = ti->private; | ||
| 2488 | |||
| 2489 | r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); | ||
| 2490 | if (!r) | ||
| 2491 | r = fn(ti, cache->origin_dev, 0, ti->len, data); | ||
| 2492 | |||
| 2493 | return r; | ||
| 2494 | } | ||
| 2495 | |||
| 2496 | /* | ||
| 2497 | * We assume I/O is going to the origin (which is the volume | ||
| 2498 | * more likely to have restrictions e.g. by being striped). | ||
| 2499 | * (Looking up the exact location of the data would be expensive | ||
| 2500 | * and could always be out of date by the time the bio is submitted.) | ||
| 2501 | */ | ||
| 2502 | static int cache_bvec_merge(struct dm_target *ti, | ||
| 2503 | struct bvec_merge_data *bvm, | ||
| 2504 | struct bio_vec *biovec, int max_size) | ||
| 2505 | { | ||
| 2506 | struct cache *cache = ti->private; | ||
| 2507 | struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev); | ||
| 2508 | |||
| 2509 | if (!q->merge_bvec_fn) | ||
| 2510 | return max_size; | ||
| 2511 | |||
| 2512 | bvm->bi_bdev = cache->origin_dev->bdev; | ||
| 2513 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | ||
| 2514 | } | ||
| 2515 | |||
| 2516 | static void set_discard_limits(struct cache *cache, struct queue_limits *limits) | ||
| 2517 | { | ||
| 2518 | /* | ||
| 2519 | * FIXME: these limits may be incompatible with the cache device | ||
| 2520 | */ | ||
| 2521 | limits->max_discard_sectors = cache->discard_block_size * 1024; | ||
| 2522 | limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; | ||
| 2523 | } | ||
| 2524 | |||
| 2525 | static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) | ||
| 2526 | { | ||
| 2527 | struct cache *cache = ti->private; | ||
| 2528 | |||
| 2529 | blk_limits_io_min(limits, 0); | ||
| 2530 | blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); | ||
| 2531 | set_discard_limits(cache, limits); | ||
| 2532 | } | ||
| 2533 | |||
| 2534 | /*----------------------------------------------------------------*/ | ||
| 2535 | |||
| 2536 | static struct target_type cache_target = { | ||
| 2537 | .name = "cache", | ||
| 2538 | .version = {1, 0, 0}, | ||
| 2539 | .module = THIS_MODULE, | ||
| 2540 | .ctr = cache_ctr, | ||
| 2541 | .dtr = cache_dtr, | ||
| 2542 | .map = cache_map, | ||
| 2543 | .end_io = cache_end_io, | ||
| 2544 | .postsuspend = cache_postsuspend, | ||
| 2545 | .preresume = cache_preresume, | ||
| 2546 | .resume = cache_resume, | ||
| 2547 | .status = cache_status, | ||
| 2548 | .message = cache_message, | ||
| 2549 | .iterate_devices = cache_iterate_devices, | ||
| 2550 | .merge = cache_bvec_merge, | ||
| 2551 | .io_hints = cache_io_hints, | ||
| 2552 | }; | ||
| 2553 | |||
| 2554 | static int __init dm_cache_init(void) | ||
| 2555 | { | ||
| 2556 | int r; | ||
| 2557 | |||
| 2558 | r = dm_register_target(&cache_target); | ||
| 2559 | if (r) { | ||
| 2560 | DMERR("cache target registration failed: %d", r); | ||
| 2561 | return r; | ||
| 2562 | } | ||
| 2563 | |||
| 2564 | migration_cache = KMEM_CACHE(dm_cache_migration, 0); | ||
| 2565 | if (!migration_cache) { | ||
| 2566 | dm_unregister_target(&cache_target); | ||
| 2567 | return -ENOMEM; | ||
| 2568 | } | ||
| 2569 | |||
| 2570 | return 0; | ||
| 2571 | } | ||
| 2572 | |||
| 2573 | static void __exit dm_cache_exit(void) | ||
| 2574 | { | ||
| 2575 | dm_unregister_target(&cache_target); | ||
| 2576 | kmem_cache_destroy(migration_cache); | ||
| 2577 | } | ||
| 2578 | |||
| 2579 | module_init(dm_cache_init); | ||
| 2580 | module_exit(dm_cache_exit); | ||
| 2581 | |||
| 2582 | MODULE_DESCRIPTION(DM_NAME " cache target"); | ||
| 2583 | MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); | ||
| 2584 | MODULE_LICENSE("GPL"); | ||
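
For reference, the status-format comment in cache_status() above fixes the order of the STATUSTYPE_INFO fields. The following is a rough, illustrative sketch of how a userspace consumer might pull apart the fixed leading fields; the sample line, and the parser itself, are not part of the patch:

	#include <stdio.h>

	/* Illustrative parser for the fixed leading fields cache_status()
	 * emits for STATUSTYPE_INFO, in the order given by its comment:
	 * <used>/<total metadata blocks> <read hits> <read misses>
	 * <write hits> <write misses> <demotions> <promotions>
	 * <residency> <dirty>
	 */
	int main(void)
	{
		const char *line = "17/4096 21 4 6 2 1 3 1024 5"; /* invented sample */
		unsigned long long used, total, residency;
		unsigned rh, rm, wh, wm, dem, prom, dirty;

		if (sscanf(line, "%llu/%llu %u %u %u %u %u %u %llu %u",
			   &used, &total, &rh, &rm, &wh, &wm,
			   &dem, &prom, &residency, &dirty) != 10)
			return 1;

		printf("metadata %llu/%llu, residency %llu, dirty %u\n",
		       used, total, residency, dirty);
		return 0;
	}
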
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index f7369f9d8595..13c15480d940 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c | |||
| @@ -1234,20 +1234,6 @@ static int crypt_decode_key(u8 *key, char *hex, unsigned int size) | |||
| 1234 | return 0; | 1234 | return 0; |
| 1235 | } | 1235 | } |
| 1236 | 1236 | ||
| 1237 | /* | ||
| 1238 | * Encode key into its hex representation | ||
| 1239 | */ | ||
| 1240 | static void crypt_encode_key(char *hex, u8 *key, unsigned int size) | ||
| 1241 | { | ||
| 1242 | unsigned int i; | ||
| 1243 | |||
| 1244 | for (i = 0; i < size; i++) { | ||
| 1245 | sprintf(hex, "%02x", *key); | ||
| 1246 | hex += 2; | ||
| 1247 | key++; | ||
| 1248 | } | ||
| 1249 | } | ||
| 1250 | |||
| 1251 | static void crypt_free_tfms(struct crypt_config *cc) | 1237 | static void crypt_free_tfms(struct crypt_config *cc) |
| 1252 | { | 1238 | { |
| 1253 | unsigned i; | 1239 | unsigned i; |
| @@ -1651,7 +1637,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 1651 | 1637 | ||
| 1652 | if (opt_params == 1 && opt_string && | 1638 | if (opt_params == 1 && opt_string && |
| 1653 | !strcasecmp(opt_string, "allow_discards")) | 1639 | !strcasecmp(opt_string, "allow_discards")) |
| 1654 | ti->num_discard_requests = 1; | 1640 | ti->num_discard_bios = 1; |
| 1655 | else if (opt_params) { | 1641 | else if (opt_params) { |
| 1656 | ret = -EINVAL; | 1642 | ret = -EINVAL; |
| 1657 | ti->error = "Invalid feature arguments"; | 1643 | ti->error = "Invalid feature arguments"; |
| @@ -1679,7 +1665,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 1679 | goto bad; | 1665 | goto bad; |
| 1680 | } | 1666 | } |
| 1681 | 1667 | ||
| 1682 | ti->num_flush_requests = 1; | 1668 | ti->num_flush_bios = 1; |
| 1683 | ti->discard_zeroes_data_unsupported = true; | 1669 | ti->discard_zeroes_data_unsupported = true; |
| 1684 | 1670 | ||
| 1685 | return 0; | 1671 | return 0; |
| @@ -1717,11 +1703,11 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) | |||
| 1717 | return DM_MAPIO_SUBMITTED; | 1703 | return DM_MAPIO_SUBMITTED; |
| 1718 | } | 1704 | } |
| 1719 | 1705 | ||
| 1720 | static int crypt_status(struct dm_target *ti, status_type_t type, | 1706 | static void crypt_status(struct dm_target *ti, status_type_t type, |
| 1721 | unsigned status_flags, char *result, unsigned maxlen) | 1707 | unsigned status_flags, char *result, unsigned maxlen) |
| 1722 | { | 1708 | { |
| 1723 | struct crypt_config *cc = ti->private; | 1709 | struct crypt_config *cc = ti->private; |
| 1724 | unsigned int sz = 0; | 1710 | unsigned i, sz = 0; |
| 1725 | 1711 | ||
| 1726 | switch (type) { | 1712 | switch (type) { |
| 1727 | case STATUSTYPE_INFO: | 1713 | case STATUSTYPE_INFO: |
| @@ -1731,27 +1717,20 @@ static int crypt_status(struct dm_target *ti, status_type_t type, | |||
| 1731 | case STATUSTYPE_TABLE: | 1717 | case STATUSTYPE_TABLE: |
| 1732 | DMEMIT("%s ", cc->cipher_string); | 1718 | DMEMIT("%s ", cc->cipher_string); |
| 1733 | 1719 | ||
| 1734 | if (cc->key_size > 0) { | 1720 | if (cc->key_size > 0) |
| 1735 | if ((maxlen - sz) < ((cc->key_size << 1) + 1)) | 1721 | for (i = 0; i < cc->key_size; i++) |
| 1736 | return -ENOMEM; | 1722 | DMEMIT("%02x", cc->key[i]); |
| 1737 | 1723 | else | |
| 1738 | crypt_encode_key(result + sz, cc->key, cc->key_size); | 1724 | DMEMIT("-"); |
| 1739 | sz += cc->key_size << 1; | ||
| 1740 | } else { | ||
| 1741 | if (sz >= maxlen) | ||
| 1742 | return -ENOMEM; | ||
| 1743 | result[sz++] = '-'; | ||
| 1744 | } | ||
| 1745 | 1725 | ||
| 1746 | DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset, | 1726 | DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset, |
| 1747 | cc->dev->name, (unsigned long long)cc->start); | 1727 | cc->dev->name, (unsigned long long)cc->start); |
| 1748 | 1728 | ||
| 1749 | if (ti->num_discard_requests) | 1729 | if (ti->num_discard_bios) |
| 1750 | DMEMIT(" 1 allow_discards"); | 1730 | DMEMIT(" 1 allow_discards"); |
| 1751 | 1731 | ||
| 1752 | break; | 1732 | break; |
| 1753 | } | 1733 | } |
| 1754 | return 0; | ||
| 1755 | } | 1734 | } |
| 1756 | 1735 | ||
| 1757 | static void crypt_postsuspend(struct dm_target *ti) | 1736 | static void crypt_postsuspend(struct dm_target *ti) |
| @@ -1845,7 +1824,7 @@ static int crypt_iterate_devices(struct dm_target *ti, | |||
| 1845 | 1824 | ||
| 1846 | static struct target_type crypt_target = { | 1825 | static struct target_type crypt_target = { |
| 1847 | .name = "crypt", | 1826 | .name = "crypt", |
| 1848 | .version = {1, 12, 0}, | 1827 | .version = {1, 12, 1}, |
| 1849 | .module = THIS_MODULE, | 1828 | .module = THIS_MODULE, |
| 1850 | .ctr = crypt_ctr, | 1829 | .ctr = crypt_ctr, |
| 1851 | .dtr = crypt_dtr, | 1830 | .dtr = crypt_dtr, |
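
The crypt_status() change above removes crypt_encode_key() and emits the key inline, one "%02x" per byte, or "-" when no key is set. A standalone sketch of the same encoding, with invented key bytes:

	#include <stdio.h>

	/* The encoding crypt_status() now does inline via DMEMIT("%02x", ...):
	 * each key byte becomes two hex digits. Key bytes invented.
	 */
	int main(void)
	{
		unsigned char key[] = { 0xde, 0xad, 0xbe, 0xef };
		unsigned i, key_size = sizeof(key);

		if (key_size > 0)
			for (i = 0; i < key_size; i++)
				printf("%02x", key[i]);	/* prints "deadbeef" */
		else
			printf("-");
		printf("\n");
		return 0;
	}
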
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index cc1bd048acb2..496d5f3646a5 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c | |||
| @@ -198,8 +198,8 @@ out: | |||
| 198 | mutex_init(&dc->timer_lock); | 198 | mutex_init(&dc->timer_lock); |
| 199 | atomic_set(&dc->may_delay, 1); | 199 | atomic_set(&dc->may_delay, 1); |
| 200 | 200 | ||
| 201 | ti->num_flush_requests = 1; | 201 | ti->num_flush_bios = 1; |
| 202 | ti->num_discard_requests = 1; | 202 | ti->num_discard_bios = 1; |
| 203 | ti->private = dc; | 203 | ti->private = dc; |
| 204 | return 0; | 204 | return 0; |
| 205 | 205 | ||
| @@ -293,8 +293,8 @@ static int delay_map(struct dm_target *ti, struct bio *bio) | |||
| 293 | return delay_bio(dc, dc->read_delay, bio); | 293 | return delay_bio(dc, dc->read_delay, bio); |
| 294 | } | 294 | } |
| 295 | 295 | ||
| 296 | static int delay_status(struct dm_target *ti, status_type_t type, | 296 | static void delay_status(struct dm_target *ti, status_type_t type, |
| 297 | unsigned status_flags, char *result, unsigned maxlen) | 297 | unsigned status_flags, char *result, unsigned maxlen) |
| 298 | { | 298 | { |
| 299 | struct delay_c *dc = ti->private; | 299 | struct delay_c *dc = ti->private; |
| 300 | int sz = 0; | 300 | int sz = 0; |
| @@ -314,8 +314,6 @@ static int delay_status(struct dm_target *ti, status_type_t type, | |||
| 314 | dc->write_delay); | 314 | dc->write_delay); |
| 315 | break; | 315 | break; |
| 316 | } | 316 | } |
| 317 | |||
| 318 | return 0; | ||
| 319 | } | 317 | } |
| 320 | 318 | ||
| 321 | static int delay_iterate_devices(struct dm_target *ti, | 319 | static int delay_iterate_devices(struct dm_target *ti, |
| @@ -337,7 +335,7 @@ out: | |||
| 337 | 335 | ||
| 338 | static struct target_type delay_target = { | 336 | static struct target_type delay_target = { |
| 339 | .name = "delay", | 337 | .name = "delay", |
| 340 | .version = {1, 2, 0}, | 338 | .version = {1, 2, 1}, |
| 341 | .module = THIS_MODULE, | 339 | .module = THIS_MODULE, |
| 342 | .ctr = delay_ctr, | 340 | .ctr = delay_ctr, |
| 343 | .dtr = delay_dtr, | 341 | .dtr = delay_dtr, |
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 9721f2ffb1a2..7fcf21cb4ff8 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c | |||
| @@ -216,8 +216,8 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 216 | goto bad; | 216 | goto bad; |
| 217 | } | 217 | } |
| 218 | 218 | ||
| 219 | ti->num_flush_requests = 1; | 219 | ti->num_flush_bios = 1; |
| 220 | ti->num_discard_requests = 1; | 220 | ti->num_discard_bios = 1; |
| 221 | ti->per_bio_data_size = sizeof(struct per_bio_data); | 221 | ti->per_bio_data_size = sizeof(struct per_bio_data); |
| 222 | ti->private = fc; | 222 | ti->private = fc; |
| 223 | return 0; | 223 | return 0; |
| @@ -337,8 +337,8 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error) | |||
| 337 | return error; | 337 | return error; |
| 338 | } | 338 | } |
| 339 | 339 | ||
| 340 | static int flakey_status(struct dm_target *ti, status_type_t type, | 340 | static void flakey_status(struct dm_target *ti, status_type_t type, |
| 341 | unsigned status_flags, char *result, unsigned maxlen) | 341 | unsigned status_flags, char *result, unsigned maxlen) |
| 342 | { | 342 | { |
| 343 | unsigned sz = 0; | 343 | unsigned sz = 0; |
| 344 | struct flakey_c *fc = ti->private; | 344 | struct flakey_c *fc = ti->private; |
| @@ -368,7 +368,6 @@ static int flakey_status(struct dm_target *ti, status_type_t type, | |||
| 368 | 368 | ||
| 369 | break; | 369 | break; |
| 370 | } | 370 | } |
| 371 | return 0; | ||
| 372 | } | 371 | } |
| 373 | 372 | ||
| 374 | static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg) | 373 | static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg) |
| @@ -411,7 +410,7 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_ | |||
| 411 | 410 | ||
| 412 | static struct target_type flakey_target = { | 411 | static struct target_type flakey_target = { |
| 413 | .name = "flakey", | 412 | .name = "flakey", |
| 414 | .version = {1, 3, 0}, | 413 | .version = {1, 3, 1}, |
| 415 | .module = THIS_MODULE, | 414 | .module = THIS_MODULE, |
| 416 | .ctr = flakey_ctr, | 415 | .ctr = flakey_ctr, |
| 417 | .dtr = flakey_dtr, | 416 | .dtr = flakey_dtr, |
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 0666b5d14b88..aa04f0224642 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c | |||
| @@ -1067,6 +1067,7 @@ static void retrieve_status(struct dm_table *table, | |||
| 1067 | num_targets = dm_table_get_num_targets(table); | 1067 | num_targets = dm_table_get_num_targets(table); |
| 1068 | for (i = 0; i < num_targets; i++) { | 1068 | for (i = 0; i < num_targets; i++) { |
| 1069 | struct dm_target *ti = dm_table_get_target(table, i); | 1069 | struct dm_target *ti = dm_table_get_target(table, i); |
| 1070 | size_t l; | ||
| 1070 | 1071 | ||
| 1071 | remaining = len - (outptr - outbuf); | 1072 | remaining = len - (outptr - outbuf); |
| 1072 | if (remaining <= sizeof(struct dm_target_spec)) { | 1073 | if (remaining <= sizeof(struct dm_target_spec)) { |
| @@ -1093,14 +1094,17 @@ static void retrieve_status(struct dm_table *table, | |||
| 1093 | if (ti->type->status) { | 1094 | if (ti->type->status) { |
| 1094 | if (param->flags & DM_NOFLUSH_FLAG) | 1095 | if (param->flags & DM_NOFLUSH_FLAG) |
| 1095 | status_flags |= DM_STATUS_NOFLUSH_FLAG; | 1096 | status_flags |= DM_STATUS_NOFLUSH_FLAG; |
| 1096 | if (ti->type->status(ti, type, status_flags, outptr, remaining)) { | 1097 | ti->type->status(ti, type, status_flags, outptr, remaining); |
| 1097 | param->flags |= DM_BUFFER_FULL_FLAG; | ||
| 1098 | break; | ||
| 1099 | } | ||
| 1100 | } else | 1098 | } else |
| 1101 | outptr[0] = '\0'; | 1099 | outptr[0] = '\0'; |
| 1102 | 1100 | ||
| 1103 | outptr += strlen(outptr) + 1; | 1101 | l = strlen(outptr) + 1; |
| 1102 | if (l == remaining) { | ||
| 1103 | param->flags |= DM_BUFFER_FULL_FLAG; | ||
| 1104 | break; | ||
| 1105 | } | ||
| 1106 | |||
| 1107 | outptr += l; | ||
| 1104 | used = param->data_start + (outptr - outbuf); | 1108 | used = param->data_start + (outptr - outbuf); |
| 1105 | 1109 | ||
| 1106 | outptr = align_ptr(outptr); | 1110 | outptr = align_ptr(outptr); |
| @@ -1410,6 +1414,22 @@ static int table_status(struct dm_ioctl *param, size_t param_size) | |||
| 1410 | return 0; | 1414 | return 0; |
| 1411 | } | 1415 | } |
| 1412 | 1416 | ||
| 1417 | static bool buffer_test_overflow(char *result, unsigned maxlen) | ||
| 1418 | { | ||
| 1419 | return !maxlen || strlen(result) + 1 >= maxlen; | ||
| 1420 | } | ||
| 1421 | |||
| 1422 | /* | ||
| 1423 | * Process device-mapper dependent messages. | ||
| 1424 | * Returns a number <= 1 if message was processed by device mapper. | ||
| 1425 | * Returns 2 if message should be delivered to the target. | ||
| 1426 | */ | ||
| 1427 | static int message_for_md(struct mapped_device *md, unsigned argc, char **argv, | ||
| 1428 | char *result, unsigned maxlen) | ||
| 1429 | { | ||
| 1430 | return 2; | ||
| 1431 | } | ||
| 1432 | |||
| 1413 | /* | 1433 | /* |
| 1414 | * Pass a message to the target that's at the supplied device offset. | 1434 | * Pass a message to the target that's at the supplied device offset. |
| 1415 | */ | 1435 | */ |
| @@ -1421,6 +1441,8 @@ static int target_message(struct dm_ioctl *param, size_t param_size) | |||
| 1421 | struct dm_table *table; | 1441 | struct dm_table *table; |
| 1422 | struct dm_target *ti; | 1442 | struct dm_target *ti; |
| 1423 | struct dm_target_msg *tmsg = (void *) param + param->data_start; | 1443 | struct dm_target_msg *tmsg = (void *) param + param->data_start; |
| 1444 | size_t maxlen; | ||
| 1445 | char *result = get_result_buffer(param, param_size, &maxlen); | ||
| 1424 | 1446 | ||
| 1425 | md = find_device(param); | 1447 | md = find_device(param); |
| 1426 | if (!md) | 1448 | if (!md) |
| @@ -1444,6 +1466,10 @@ static int target_message(struct dm_ioctl *param, size_t param_size) | |||
| 1444 | goto out_argv; | 1466 | goto out_argv; |
| 1445 | } | 1467 | } |
| 1446 | 1468 | ||
| 1469 | r = message_for_md(md, argc, argv, result, maxlen); | ||
| 1470 | if (r <= 1) | ||
| 1471 | goto out_argv; | ||
| 1472 | |||
| 1447 | table = dm_get_live_table(md); | 1473 | table = dm_get_live_table(md); |
| 1448 | if (!table) | 1474 | if (!table) |
| 1449 | goto out_argv; | 1475 | goto out_argv; |
| @@ -1469,44 +1495,68 @@ static int target_message(struct dm_ioctl *param, size_t param_size) | |||
| 1469 | out_argv: | 1495 | out_argv: |
| 1470 | kfree(argv); | 1496 | kfree(argv); |
| 1471 | out: | 1497 | out: |
| 1472 | param->data_size = 0; | 1498 | if (r >= 0) |
| 1499 | __dev_status(md, param); | ||
| 1500 | |||
| 1501 | if (r == 1) { | ||
| 1502 | param->flags |= DM_DATA_OUT_FLAG; | ||
| 1503 | if (buffer_test_overflow(result, maxlen)) | ||
| 1504 | param->flags |= DM_BUFFER_FULL_FLAG; | ||
| 1505 | else | ||
| 1506 | param->data_size = param->data_start + strlen(result) + 1; | ||
| 1507 | r = 0; | ||
| 1508 | } | ||
| 1509 | |||
| 1473 | dm_put(md); | 1510 | dm_put(md); |
| 1474 | return r; | 1511 | return r; |
| 1475 | } | 1512 | } |
| 1476 | 1513 | ||
| 1514 | /* | ||
| 1515 | * The ioctl parameter block consists of two parts, a dm_ioctl struct | ||
| 1516 | * followed by a data buffer. This flag is set if the second part, | ||
| 1517 | * which has a variable size, is not used by the function processing | ||
| 1518 | * the ioctl. | ||
| 1519 | */ | ||
| 1520 | #define IOCTL_FLAGS_NO_PARAMS 1 | ||
| 1521 | |||
| 1477 | /*----------------------------------------------------------------- | 1522 | /*----------------------------------------------------------------- |
| 1478 | * Implementation of open/close/ioctl on the special char | 1523 | * Implementation of open/close/ioctl on the special char |
| 1479 | * device. | 1524 | * device. |
| 1480 | *---------------------------------------------------------------*/ | 1525 | *---------------------------------------------------------------*/ |
| 1481 | static ioctl_fn lookup_ioctl(unsigned int cmd) | 1526 | static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) |
| 1482 | { | 1527 | { |
| 1483 | static struct { | 1528 | static struct { |
| 1484 | int cmd; | 1529 | int cmd; |
| 1530 | int flags; | ||
| 1485 | ioctl_fn fn; | 1531 | ioctl_fn fn; |
| 1486 | } _ioctls[] = { | 1532 | } _ioctls[] = { |
| 1487 | {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */ | 1533 | {DM_VERSION_CMD, 0, NULL}, /* version is dealt with elsewhere */ |
| 1488 | {DM_REMOVE_ALL_CMD, remove_all}, | 1534 | {DM_REMOVE_ALL_CMD, IOCTL_FLAGS_NO_PARAMS, remove_all}, |
| 1489 | {DM_LIST_DEVICES_CMD, list_devices}, | 1535 | {DM_LIST_DEVICES_CMD, 0, list_devices}, |
| 1490 | 1536 | ||
| 1491 | {DM_DEV_CREATE_CMD, dev_create}, | 1537 | {DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_create}, |
| 1492 | {DM_DEV_REMOVE_CMD, dev_remove}, | 1538 | {DM_DEV_REMOVE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_remove}, |
| 1493 | {DM_DEV_RENAME_CMD, dev_rename}, | 1539 | {DM_DEV_RENAME_CMD, 0, dev_rename}, |
| 1494 | {DM_DEV_SUSPEND_CMD, dev_suspend}, | 1540 | {DM_DEV_SUSPEND_CMD, IOCTL_FLAGS_NO_PARAMS, dev_suspend}, |
| 1495 | {DM_DEV_STATUS_CMD, dev_status}, | 1541 | {DM_DEV_STATUS_CMD, IOCTL_FLAGS_NO_PARAMS, dev_status}, |
| 1496 | {DM_DEV_WAIT_CMD, dev_wait}, | 1542 | {DM_DEV_WAIT_CMD, 0, dev_wait}, |
| 1497 | 1543 | ||
| 1498 | {DM_TABLE_LOAD_CMD, table_load}, | 1544 | {DM_TABLE_LOAD_CMD, 0, table_load}, |
| 1499 | {DM_TABLE_CLEAR_CMD, table_clear}, | 1545 | {DM_TABLE_CLEAR_CMD, IOCTL_FLAGS_NO_PARAMS, table_clear}, |
| 1500 | {DM_TABLE_DEPS_CMD, table_deps}, | 1546 | {DM_TABLE_DEPS_CMD, 0, table_deps}, |
| 1501 | {DM_TABLE_STATUS_CMD, table_status}, | 1547 | {DM_TABLE_STATUS_CMD, 0, table_status}, |
| 1502 | 1548 | ||
| 1503 | {DM_LIST_VERSIONS_CMD, list_versions}, | 1549 | {DM_LIST_VERSIONS_CMD, 0, list_versions}, |
| 1504 | 1550 | ||
| 1505 | {DM_TARGET_MSG_CMD, target_message}, | 1551 | {DM_TARGET_MSG_CMD, 0, target_message}, |
| 1506 | {DM_DEV_SET_GEOMETRY_CMD, dev_set_geometry} | 1552 | {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry} |
| 1507 | }; | 1553 | }; |
| 1508 | 1554 | ||
| 1509 | return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn; | 1555 | if (unlikely(cmd >= ARRAY_SIZE(_ioctls))) |
| 1556 | return NULL; | ||
| 1557 | |||
| 1558 | *ioctl_flags = _ioctls[cmd].flags; | ||
| 1559 | return _ioctls[cmd].fn; | ||
| 1510 | } | 1560 | } |
| 1511 | 1561 | ||
| 1512 | /* | 1562 | /* |
| @@ -1543,7 +1593,8 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user) | |||
| 1543 | return r; | 1593 | return r; |
| 1544 | } | 1594 | } |
| 1545 | 1595 | ||
| 1546 | #define DM_PARAMS_VMALLOC 0x0001 /* Params alloced with vmalloc not kmalloc */ | 1596 | #define DM_PARAMS_KMALLOC 0x0001 /* Params alloced with kmalloc */ |
| 1597 | #define DM_PARAMS_VMALLOC 0x0002 /* Params alloced with vmalloc */ | ||
| 1547 | #define DM_WIPE_BUFFER 0x0010 /* Wipe input buffer before returning from ioctl */ | 1598 | #define DM_WIPE_BUFFER 0x0010 /* Wipe input buffer before returning from ioctl */ |
| 1548 | 1599 | ||
| 1549 | static void free_params(struct dm_ioctl *param, size_t param_size, int param_flags) | 1600 | static void free_params(struct dm_ioctl *param, size_t param_size, int param_flags) |
| @@ -1551,66 +1602,80 @@ static void free_params(struct dm_ioctl *param, size_t param_size, int param_fla | |||
| 1551 | if (param_flags & DM_WIPE_BUFFER) | 1602 | if (param_flags & DM_WIPE_BUFFER) |
| 1552 | memset(param, 0, param_size); | 1603 | memset(param, 0, param_size); |
| 1553 | 1604 | ||
| 1605 | if (param_flags & DM_PARAMS_KMALLOC) | ||
| 1606 | kfree(param); | ||
| 1554 | if (param_flags & DM_PARAMS_VMALLOC) | 1607 | if (param_flags & DM_PARAMS_VMALLOC) |
| 1555 | vfree(param); | 1608 | vfree(param); |
| 1556 | else | ||
| 1557 | kfree(param); | ||
| 1558 | } | 1609 | } |
| 1559 | 1610 | ||
| 1560 | static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param, int *param_flags) | 1611 | static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kernel, |
| 1612 | int ioctl_flags, | ||
| 1613 | struct dm_ioctl **param, int *param_flags) | ||
| 1561 | { | 1614 | { |
| 1562 | struct dm_ioctl tmp, *dmi; | 1615 | struct dm_ioctl *dmi; |
| 1563 | int secure_data; | 1616 | int secure_data; |
| 1617 | const size_t minimum_data_size = sizeof(*param_kernel) - sizeof(param_kernel->data); | ||
| 1564 | 1618 | ||
| 1565 | if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data))) | 1619 | if (copy_from_user(param_kernel, user, minimum_data_size)) |
| 1566 | return -EFAULT; | 1620 | return -EFAULT; |
| 1567 | 1621 | ||
| 1568 | if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data))) | 1622 | if (param_kernel->data_size < minimum_data_size) |
| 1569 | return -EINVAL; | 1623 | return -EINVAL; |
| 1570 | 1624 | ||
| 1571 | secure_data = tmp.flags & DM_SECURE_DATA_FLAG; | 1625 | secure_data = param_kernel->flags & DM_SECURE_DATA_FLAG; |
| 1572 | 1626 | ||
| 1573 | *param_flags = secure_data ? DM_WIPE_BUFFER : 0; | 1627 | *param_flags = secure_data ? DM_WIPE_BUFFER : 0; |
| 1574 | 1628 | ||
| 1629 | if (ioctl_flags & IOCTL_FLAGS_NO_PARAMS) { | ||
| 1630 | dmi = param_kernel; | ||
| 1631 | dmi->data_size = minimum_data_size; | ||
| 1632 | goto data_copied; | ||
| 1633 | } | ||
| 1634 | |||
| 1575 | /* | 1635 | /* |
| 1576 | * Try to avoid low memory issues when a device is suspended. | 1636 | * Try to avoid low memory issues when a device is suspended. |
| 1577 | * Use kmalloc() rather than vmalloc() when we can. | 1637 | * Use kmalloc() rather than vmalloc() when we can. |
| 1578 | */ | 1638 | */ |
| 1579 | dmi = NULL; | 1639 | dmi = NULL; |
| 1580 | if (tmp.data_size <= KMALLOC_MAX_SIZE) | 1640 | if (param_kernel->data_size <= KMALLOC_MAX_SIZE) { |
| 1581 | dmi = kmalloc(tmp.data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); | 1641 | dmi = kmalloc(param_kernel->data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); |
| 1642 | if (dmi) | ||
| 1643 | *param_flags |= DM_PARAMS_KMALLOC; | ||
| 1644 | } | ||
| 1582 | 1645 | ||
| 1583 | if (!dmi) { | 1646 | if (!dmi) { |
| 1584 | dmi = __vmalloc(tmp.data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH, PAGE_KERNEL); | 1647 | dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH, PAGE_KERNEL); |
| 1585 | *param_flags |= DM_PARAMS_VMALLOC; | 1648 | if (dmi) |
| 1649 | *param_flags |= DM_PARAMS_VMALLOC; | ||
| 1586 | } | 1650 | } |
| 1587 | 1651 | ||
| 1588 | if (!dmi) { | 1652 | if (!dmi) { |
| 1589 | if (secure_data && clear_user(user, tmp.data_size)) | 1653 | if (secure_data && clear_user(user, param_kernel->data_size)) |
| 1590 | return -EFAULT; | 1654 | return -EFAULT; |
| 1591 | return -ENOMEM; | 1655 | return -ENOMEM; |
| 1592 | } | 1656 | } |
| 1593 | 1657 | ||
| 1594 | if (copy_from_user(dmi, user, tmp.data_size)) | 1658 | if (copy_from_user(dmi, user, param_kernel->data_size)) |
| 1595 | goto bad; | 1659 | goto bad; |
| 1596 | 1660 | ||
| 1661 | data_copied: | ||
| 1597 | /* | 1662 | /* |
| 1598 | * Abort if something changed the ioctl data while it was being copied. | 1663 | * Abort if something changed the ioctl data while it was being copied. |
| 1599 | */ | 1664 | */ |
| 1600 | if (dmi->data_size != tmp.data_size) { | 1665 | if (dmi->data_size != param_kernel->data_size) { |
| 1601 | DMERR("rejecting ioctl: data size modified while processing parameters"); | 1666 | DMERR("rejecting ioctl: data size modified while processing parameters"); |
| 1602 | goto bad; | 1667 | goto bad; |
| 1603 | } | 1668 | } |
| 1604 | 1669 | ||
| 1605 | /* Wipe the user buffer so we do not return it to userspace */ | 1670 | /* Wipe the user buffer so we do not return it to userspace */ |
| 1606 | if (secure_data && clear_user(user, tmp.data_size)) | 1671 | if (secure_data && clear_user(user, param_kernel->data_size)) |
| 1607 | goto bad; | 1672 | goto bad; |
| 1608 | 1673 | ||
| 1609 | *param = dmi; | 1674 | *param = dmi; |
| 1610 | return 0; | 1675 | return 0; |
| 1611 | 1676 | ||
| 1612 | bad: | 1677 | bad: |
| 1613 | free_params(dmi, tmp.data_size, *param_flags); | 1678 | free_params(dmi, param_kernel->data_size, *param_flags); |
| 1614 | 1679 | ||
| 1615 | return -EFAULT; | 1680 | return -EFAULT; |
| 1616 | } | 1681 | } |
| @@ -1621,6 +1686,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param) | |||
| 1621 | param->flags &= ~DM_BUFFER_FULL_FLAG; | 1686 | param->flags &= ~DM_BUFFER_FULL_FLAG; |
| 1622 | param->flags &= ~DM_UEVENT_GENERATED_FLAG; | 1687 | param->flags &= ~DM_UEVENT_GENERATED_FLAG; |
| 1623 | param->flags &= ~DM_SECURE_DATA_FLAG; | 1688 | param->flags &= ~DM_SECURE_DATA_FLAG; |
| 1689 | param->flags &= ~DM_DATA_OUT_FLAG; | ||
| 1624 | 1690 | ||
| 1625 | /* Ignores parameters */ | 1691 | /* Ignores parameters */ |
| 1626 | if (cmd == DM_REMOVE_ALL_CMD || | 1692 | if (cmd == DM_REMOVE_ALL_CMD || |
| @@ -1648,11 +1714,13 @@ static int validate_params(uint cmd, struct dm_ioctl *param) | |||
| 1648 | static int ctl_ioctl(uint command, struct dm_ioctl __user *user) | 1714 | static int ctl_ioctl(uint command, struct dm_ioctl __user *user) |
| 1649 | { | 1715 | { |
| 1650 | int r = 0; | 1716 | int r = 0; |
| 1717 | int ioctl_flags; | ||
| 1651 | int param_flags; | 1718 | int param_flags; |
| 1652 | unsigned int cmd; | 1719 | unsigned int cmd; |
| 1653 | struct dm_ioctl *uninitialized_var(param); | 1720 | struct dm_ioctl *uninitialized_var(param); |
| 1654 | ioctl_fn fn = NULL; | 1721 | ioctl_fn fn = NULL; |
| 1655 | size_t input_param_size; | 1722 | size_t input_param_size; |
| 1723 | struct dm_ioctl param_kernel; | ||
| 1656 | 1724 | ||
| 1657 | /* only root can play with this */ | 1725 | /* only root can play with this */ |
| 1658 | if (!capable(CAP_SYS_ADMIN)) | 1726 | if (!capable(CAP_SYS_ADMIN)) |
| @@ -1677,7 +1745,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) | |||
| 1677 | if (cmd == DM_VERSION_CMD) | 1745 | if (cmd == DM_VERSION_CMD) |
| 1678 | return 0; | 1746 | return 0; |
| 1679 | 1747 | ||
| 1680 | fn = lookup_ioctl(cmd); | 1748 | fn = lookup_ioctl(cmd, &ioctl_flags); |
| 1681 | if (!fn) { | 1749 | if (!fn) { |
| 1682 | DMWARN("dm_ctl_ioctl: unknown command 0x%x", command); | 1750 | DMWARN("dm_ctl_ioctl: unknown command 0x%x", command); |
| 1683 | return -ENOTTY; | 1751 | return -ENOTTY; |
| @@ -1686,7 +1754,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) | |||
| 1686 | /* | 1754 | /* |
| 1687 | * Copy the parameters into kernel space. | 1755 | * Copy the parameters into kernel space. |
| 1688 | */ | 1756 | */ |
| 1689 | r = copy_params(user, ¶m, ¶m_flags); | 1757 | r = copy_params(user, ¶m_kernel, ioctl_flags, ¶m, ¶m_flags); |
| 1690 | 1758 | ||
| 1691 | if (r) | 1759 | if (r) |
| 1692 | return r; | 1760 | return r; |
| @@ -1699,6 +1767,10 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) | |||
| 1699 | param->data_size = sizeof(*param); | 1767 | param->data_size = sizeof(*param); |
| 1700 | r = fn(param, input_param_size); | 1768 | r = fn(param, input_param_size); |
| 1701 | 1769 | ||
| 1770 | if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) && | ||
| 1771 | unlikely(ioctl_flags & IOCTL_FLAGS_NO_PARAMS)) | ||
| 1772 | DMERR("ioctl %d tried to output some data but has IOCTL_FLAGS_NO_PARAMS set", cmd); | ||
| 1773 | |||
| 1702 | /* | 1774 | /* |
| 1703 | * Copy the results back to userland. | 1775 | * Copy the results back to userland. |
| 1704 | */ | 1776 | */ |
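
With the target status routines converted to return void, retrieve_status() above now detects truncation itself: the writers stop at the end of the buffer, so a result string that exactly fills the remaining space is taken to mean overflow and DM_BUFFER_FULL_FLAG is set. Below is a minimal standalone model of that convention; the names and sizes are stand-ins, not the kernel API:

	#include <stdio.h>
	#include <string.h>

	/* Model of the truncation check added to retrieve_status(): the
	 * writer fills at most the remaining space, and the reader treats
	 * a string that exactly fills it as cut short. Stand-in names.
	 */
	int main(void)
	{
		char outbuf[8];
		size_t l, remaining = sizeof(outbuf);

		snprintf(outbuf, remaining, "%s", "0 writethrough"); /* too long */

		l = strlen(outbuf) + 1;
		if (l == remaining)
			printf("buffer full: set DM_BUFFER_FULL_FLAG and retry\n");
		else
			printf("status fits: \"%s\"\n", outbuf);
		return 0;
	}
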
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 68c02673263b..d581fe5d2faf 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | #include <linux/vmalloc.h> | 22 | #include <linux/vmalloc.h> |
| 23 | #include <linux/workqueue.h> | 23 | #include <linux/workqueue.h> |
| 24 | #include <linux/mutex.h> | 24 | #include <linux/mutex.h> |
| 25 | #include <linux/delay.h> | ||
| 25 | #include <linux/device-mapper.h> | 26 | #include <linux/device-mapper.h> |
| 26 | #include <linux/dm-kcopyd.h> | 27 | #include <linux/dm-kcopyd.h> |
| 27 | 28 | ||
| @@ -51,6 +52,8 @@ struct dm_kcopyd_client { | |||
| 51 | struct workqueue_struct *kcopyd_wq; | 52 | struct workqueue_struct *kcopyd_wq; |
| 52 | struct work_struct kcopyd_work; | 53 | struct work_struct kcopyd_work; |
| 53 | 54 | ||
| 55 | struct dm_kcopyd_throttle *throttle; | ||
| 56 | |||
| 54 | /* | 57 | /* |
| 55 | * We maintain three lists of jobs: | 58 | * We maintain three lists of jobs: |
| 56 | * | 59 | * |
| @@ -68,6 +71,117 @@ struct dm_kcopyd_client { | |||
| 68 | 71 | ||
| 69 | static struct page_list zero_page_list; | 72 | static struct page_list zero_page_list; |
| 70 | 73 | ||
| 74 | static DEFINE_SPINLOCK(throttle_spinlock); | ||
| 75 | |||
| 76 | /* | ||
| 77 | * IO/IDLE accounting slowly decays after (1 << ACCOUNT_INTERVAL_SHIFT) period. | ||
| 78 | * When total_period >= (1 << ACCOUNT_INTERVAL_SHIFT) the counters are divided | ||
| 79 | * by 2. | ||
| 80 | */ | ||
| 81 | #define ACCOUNT_INTERVAL_SHIFT SHIFT_HZ | ||
| 82 | |||
| 83 | /* | ||
| 84 | * Sleep this number of milliseconds. | ||
| 85 | * | ||
| 86 | * The value was decided experimentally. | ||
| 87 | * Smaller values seem to cause an increased copy rate above the limit. | ||
| 88 | * The reason for this is unknown but possibly due to jiffies rounding errors | ||
| 89 | * or read/write cache inside the disk. | ||
| 90 | */ | ||
| 91 | #define SLEEP_MSEC 100 | ||
| 92 | |||
| 93 | /* | ||
| 94 | * Maximum number of sleep events. There is a theoretical livelock if more | ||
| 95 | * kcopyd clients do work simultaneously which this limit avoids. | ||
| 96 | */ | ||
| 97 | #define MAX_SLEEPS 10 | ||
| 98 | |||
| 99 | static void io_job_start(struct dm_kcopyd_throttle *t) | ||
| 100 | { | ||
| 101 | unsigned throttle, now, difference; | ||
| 102 | int slept = 0, skew; | ||
| 103 | |||
| 104 | if (unlikely(!t)) | ||
| 105 | return; | ||
| 106 | |||
| 107 | try_again: | ||
| 108 | spin_lock_irq(&throttle_spinlock); | ||
| 109 | |||
| 110 | throttle = ACCESS_ONCE(t->throttle); | ||
| 111 | |||
| 112 | if (likely(throttle >= 100)) | ||
| 113 | goto skip_limit; | ||
| 114 | |||
| 115 | now = jiffies; | ||
| 116 | difference = now - t->last_jiffies; | ||
| 117 | t->last_jiffies = now; | ||
| 118 | if (t->num_io_jobs) | ||
| 119 | t->io_period += difference; | ||
| 120 | t->total_period += difference; | ||
| 121 | |||
| 122 | /* | ||
| 123 | * Maintain sane values if we got a temporary overflow. | ||
| 124 | */ | ||
| 125 | if (unlikely(t->io_period > t->total_period)) | ||
| 126 | t->io_period = t->total_period; | ||
| 127 | |||
| 128 | if (unlikely(t->total_period >= (1 << ACCOUNT_INTERVAL_SHIFT))) { | ||
| 129 | int shift = fls(t->total_period >> ACCOUNT_INTERVAL_SHIFT); | ||
| 130 | t->total_period >>= shift; | ||
| 131 | t->io_period >>= shift; | ||
| 132 | } | ||
| 133 | |||
| 134 | skew = t->io_period - throttle * t->total_period / 100; | ||
| 135 | |||
| 136 | if (unlikely(skew > 0) && slept < MAX_SLEEPS) { | ||
| 137 | slept++; | ||
| 138 | spin_unlock_irq(&throttle_spinlock); | ||
| 139 | msleep(SLEEP_MSEC); | ||
| 140 | goto try_again; | ||
| 141 | } | ||
| 142 | |||
| 143 | skip_limit: | ||
| 144 | t->num_io_jobs++; | ||
| 145 | |||
| 146 | spin_unlock_irq(&throttle_spinlock); | ||
| 147 | } | ||
| 148 | |||
| 149 | static void io_job_finish(struct dm_kcopyd_throttle *t) | ||
| 150 | { | ||
| 151 | unsigned long flags; | ||
| 152 | |||
| 153 | if (unlikely(!t)) | ||
| 154 | return; | ||
| 155 | |||
| 156 | spin_lock_irqsave(&throttle_spinlock, flags); | ||
| 157 | |||
| 158 | t->num_io_jobs--; | ||
| 159 | |||
| 160 | if (likely(ACCESS_ONCE(t->throttle) >= 100)) | ||
| 161 | goto skip_limit; | ||
| 162 | |||
| 163 | if (!t->num_io_jobs) { | ||
| 164 | unsigned now, difference; | ||
| 165 | |||
| 166 | now = jiffies; | ||
| 167 | difference = now - t->last_jiffies; | ||
| 168 | t->last_jiffies = now; | ||
| 169 | |||
| 170 | t->io_period += difference; | ||
| 171 | t->total_period += difference; | ||
| 172 | |||
| 173 | /* | ||
| 174 | * Maintain sane values if we got a temporary overflow. | ||
| 175 | */ | ||
| 176 | if (unlikely(t->io_period > t->total_period)) | ||
| 177 | t->io_period = t->total_period; | ||
| 178 | } | ||
| 179 | |||
| 180 | skip_limit: | ||
| 181 | spin_unlock_irqrestore(&throttle_spinlock, flags); | ||
| 182 | } | ||
| 183 | |||
| 184 | |||
| 71 | static void wake(struct dm_kcopyd_client *kc) | 185 | static void wake(struct dm_kcopyd_client *kc) |
| 72 | { | 186 | { |
| 73 | queue_work(kc->kcopyd_wq, &kc->kcopyd_work); | 187 | queue_work(kc->kcopyd_wq, &kc->kcopyd_work); |
| @@ -348,6 +462,8 @@ static void complete_io(unsigned long error, void *context) | |||
| 348 | struct kcopyd_job *job = (struct kcopyd_job *) context; | 462 | struct kcopyd_job *job = (struct kcopyd_job *) context; |
| 349 | struct dm_kcopyd_client *kc = job->kc; | 463 | struct dm_kcopyd_client *kc = job->kc; |
| 350 | 464 | ||
| 465 | io_job_finish(kc->throttle); | ||
| 466 | |||
| 351 | if (error) { | 467 | if (error) { |
| 352 | if (job->rw & WRITE) | 468 | if (job->rw & WRITE) |
| 353 | job->write_err |= error; | 469 | job->write_err |= error; |
| @@ -389,6 +505,8 @@ static int run_io_job(struct kcopyd_job *job) | |||
| 389 | .client = job->kc->io_client, | 505 | .client = job->kc->io_client, |
| 390 | }; | 506 | }; |
| 391 | 507 | ||
| 508 | io_job_start(job->kc->throttle); | ||
| 509 | |||
| 392 | if (job->rw == READ) | 510 | if (job->rw == READ) |
| 393 | r = dm_io(&io_req, 1, &job->source, NULL); | 511 | r = dm_io(&io_req, 1, &job->source, NULL); |
| 394 | else | 512 | else |
| @@ -695,7 +813,7 @@ int kcopyd_cancel(struct kcopyd_job *job, int block) | |||
| 695 | /*----------------------------------------------------------------- | 813 | /*----------------------------------------------------------------- |
| 696 | * Client setup | 814 | * Client setup |
| 697 | *---------------------------------------------------------------*/ | 815 | *---------------------------------------------------------------*/ |
| 698 | struct dm_kcopyd_client *dm_kcopyd_client_create(void) | 816 | struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle) |
| 699 | { | 817 | { |
| 700 | int r = -ENOMEM; | 818 | int r = -ENOMEM; |
| 701 | struct dm_kcopyd_client *kc; | 819 | struct dm_kcopyd_client *kc; |
| @@ -708,6 +826,7 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(void) | |||
| 708 | INIT_LIST_HEAD(&kc->complete_jobs); | 826 | INIT_LIST_HEAD(&kc->complete_jobs); |
| 709 | INIT_LIST_HEAD(&kc->io_jobs); | 827 | INIT_LIST_HEAD(&kc->io_jobs); |
| 710 | INIT_LIST_HEAD(&kc->pages_jobs); | 828 | INIT_LIST_HEAD(&kc->pages_jobs); |
| 829 | kc->throttle = throttle; | ||
| 711 | 830 | ||
| 712 | kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache); | 831 | kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache); |
| 713 | if (!kc->job_pool) | 832 | if (!kc->job_pool) |
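
The throttle added above compares the time spent with copy I/O in flight against a configured percentage of wall time, sleeping while the copy is over budget; the decisive line is "skew = t->io_period - throttle * t->total_period / 100". A standalone sketch of that arithmetic with invented numbers:

	#include <stdio.h>

	/* Sketch of the kcopyd throttle test: sleep while the jiffies spent
	 * with I/O in flight (io_period) exceed 'throttle' percent of the
	 * jiffies elapsed overall (total_period). Numbers invented.
	 */
	int main(void)
	{
		unsigned throttle = 25;		/* allow I/O 25% of the time */
		unsigned io_period = 40;	/* jiffies with a job in flight */
		unsigned total_period = 100;	/* jiffies elapsed overall */

		int skew = io_period - throttle * total_period / 100;

		if (skew > 0)
			printf("over budget by %d jiffies: msleep(100), retry\n", skew);
		else
			printf("within budget: the I/O job starts at once\n");
		return 0;
	}
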
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 328cad5617ab..4f99d267340c 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c | |||
| @@ -53,9 +53,9 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 53 | goto bad; | 53 | goto bad; |
| 54 | } | 54 | } |
| 55 | 55 | ||
| 56 | ti->num_flush_requests = 1; | 56 | ti->num_flush_bios = 1; |
| 57 | ti->num_discard_requests = 1; | 57 | ti->num_discard_bios = 1; |
| 58 | ti->num_write_same_requests = 1; | 58 | ti->num_write_same_bios = 1; |
| 59 | ti->private = lc; | 59 | ti->private = lc; |
| 60 | return 0; | 60 | return 0; |
| 61 | 61 | ||
| @@ -95,8 +95,8 @@ static int linear_map(struct dm_target *ti, struct bio *bio) | |||
| 95 | return DM_MAPIO_REMAPPED; | 95 | return DM_MAPIO_REMAPPED; |
| 96 | } | 96 | } |
| 97 | 97 | ||
| 98 | static int linear_status(struct dm_target *ti, status_type_t type, | 98 | static void linear_status(struct dm_target *ti, status_type_t type, |
| 99 | unsigned status_flags, char *result, unsigned maxlen) | 99 | unsigned status_flags, char *result, unsigned maxlen) |
| 100 | { | 100 | { |
| 101 | struct linear_c *lc = (struct linear_c *) ti->private; | 101 | struct linear_c *lc = (struct linear_c *) ti->private; |
| 102 | 102 | ||
| @@ -110,7 +110,6 @@ static int linear_status(struct dm_target *ti, status_type_t type, | |||
| 110 | (unsigned long long)lc->start); | 110 | (unsigned long long)lc->start); |
| 111 | break; | 111 | break; |
| 112 | } | 112 | } |
| 113 | return 0; | ||
| 114 | } | 113 | } |
| 115 | 114 | ||
| 116 | static int linear_ioctl(struct dm_target *ti, unsigned int cmd, | 115 | static int linear_ioctl(struct dm_target *ti, unsigned int cmd, |
| @@ -155,7 +154,7 @@ static int linear_iterate_devices(struct dm_target *ti, | |||
| 155 | 154 | ||
| 156 | static struct target_type linear_target = { | 155 | static struct target_type linear_target = { |
| 157 | .name = "linear", | 156 | .name = "linear", |
| 158 | .version = {1, 2, 0}, | 157 | .version = {1, 2, 1}, |
| 159 | .module = THIS_MODULE, | 158 | .module = THIS_MODULE, |
| 160 | .ctr = linear_ctr, | 159 | .ctr = linear_ctr, |
| 161 | .dtr = linear_dtr, | 160 | .dtr = linear_dtr, |
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 573bd04591bf..51bb81676be3 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c | |||
| @@ -905,8 +905,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, | |||
| 905 | goto bad; | 905 | goto bad; |
| 906 | } | 906 | } |
| 907 | 907 | ||
| 908 | ti->num_flush_requests = 1; | 908 | ti->num_flush_bios = 1; |
| 909 | ti->num_discard_requests = 1; | 909 | ti->num_discard_bios = 1; |
| 910 | 910 | ||
| 911 | return 0; | 911 | return 0; |
| 912 | 912 | ||
| @@ -1378,8 +1378,8 @@ static void multipath_resume(struct dm_target *ti) | |||
| 1378 | * [priority selector-name num_ps_args [ps_args]* | 1378 | * [priority selector-name num_ps_args [ps_args]* |
| 1379 | * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+ | 1379 | * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+ |
| 1380 | */ | 1380 | */ |
| 1381 | static int multipath_status(struct dm_target *ti, status_type_t type, | 1381 | static void multipath_status(struct dm_target *ti, status_type_t type, |
| 1382 | unsigned status_flags, char *result, unsigned maxlen) | 1382 | unsigned status_flags, char *result, unsigned maxlen) |
| 1383 | { | 1383 | { |
| 1384 | int sz = 0; | 1384 | int sz = 0; |
| 1385 | unsigned long flags; | 1385 | unsigned long flags; |
| @@ -1485,8 +1485,6 @@ static int multipath_status(struct dm_target *ti, status_type_t type, | |||
| 1485 | } | 1485 | } |
| 1486 | 1486 | ||
| 1487 | spin_unlock_irqrestore(&m->lock, flags); | 1487 | spin_unlock_irqrestore(&m->lock, flags); |
| 1488 | |||
| 1489 | return 0; | ||
| 1490 | } | 1488 | } |
| 1491 | 1489 | ||
| 1492 | static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) | 1490 | static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) |
| @@ -1695,7 +1693,7 @@ out: | |||
| 1695 | *---------------------------------------------------------------*/ | 1693 | *---------------------------------------------------------------*/ |
| 1696 | static struct target_type multipath_target = { | 1694 | static struct target_type multipath_target = { |
| 1697 | .name = "multipath", | 1695 | .name = "multipath", |
| 1698 | .version = {1, 5, 0}, | 1696 | .version = {1, 5, 1}, |
| 1699 | .module = THIS_MODULE, | 1697 | .module = THIS_MODULE, |
| 1700 | .ctr = multipath_ctr, | 1698 | .ctr = multipath_ctr, |
| 1701 | .dtr = multipath_dtr, | 1699 | .dtr = multipath_dtr, |
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 9e58dbd8d8cb..311e3d35b272 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c | |||
| @@ -91,15 +91,44 @@ static struct raid_type { | |||
| 91 | {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} | 91 | {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} |
| 92 | }; | 92 | }; |
| 93 | 93 | ||
| 94 | static char *raid10_md_layout_to_format(int layout) | ||
| 95 | { | ||
| 96 | /* | ||
| 97 | * Bit 16 and 17 stand for "offset" and "use_far_sets" | ||
| 98 | * Refer to MD's raid10.c for details | ||
| 99 | */ | ||
| 100 | if ((layout & 0x10000) && (layout & 0x20000)) | ||
| 101 | return "offset"; | ||
| 102 | |||
| 103 | if ((layout & 0xFF) > 1) | ||
| 104 | return "near"; | ||
| 105 | |||
| 106 | return "far"; | ||
| 107 | } | ||
| 108 | |||
| 94 | static unsigned raid10_md_layout_to_copies(int layout) | 109 | static unsigned raid10_md_layout_to_copies(int layout) |
| 95 | { | 110 | { |
| 96 | return layout & 0xFF; | 111 | if ((layout & 0xFF) > 1) |
| 112 | return layout & 0xFF; | ||
| 113 | return (layout >> 8) & 0xFF; | ||
| 97 | } | 114 | } |
| 98 | 115 | ||
| 99 | static int raid10_format_to_md_layout(char *format, unsigned copies) | 116 | static int raid10_format_to_md_layout(char *format, unsigned copies) |
| 100 | { | 117 | { |
| 101 | /* 1 "far" copy, and 'copies' "near" copies */ | 118 | unsigned n = 1, f = 1; |
| 102 | return (1 << 8) | (copies & 0xFF); | 119 | |
| 120 | if (!strcmp("near", format)) | ||
| 121 | n = copies; | ||
| 122 | else | ||
| 123 | f = copies; | ||
| 124 | |||
| 125 | if (!strcmp("offset", format)) | ||
| 126 | return 0x30000 | (f << 8) | n; | ||
| 127 | |||
| 128 | if (!strcmp("far", format)) | ||
| 129 | return 0x20000 | (f << 8) | n; | ||
| 130 | |||
| 131 | return (f << 8) | n; | ||
| 103 | } | 132 | } |
| 104 | 133 | ||
| 105 | static struct raid_type *get_raid_type(char *name) | 134 | static struct raid_type *get_raid_type(char *name) |
| @@ -352,6 +381,7 @@ static int validate_raid_redundancy(struct raid_set *rs) | |||
| 352 | { | 381 | { |
| 353 | unsigned i, rebuild_cnt = 0; | 382 | unsigned i, rebuild_cnt = 0; |
| 354 | unsigned rebuilds_per_group, copies, d; | 383 | unsigned rebuilds_per_group, copies, d; |
| 384 | unsigned group_size, last_group_start; | ||
| 355 | 385 | ||
| 356 | for (i = 0; i < rs->md.raid_disks; i++) | 386 | for (i = 0; i < rs->md.raid_disks; i++) |
| 357 | if (!test_bit(In_sync, &rs->dev[i].rdev.flags) || | 387 | if (!test_bit(In_sync, &rs->dev[i].rdev.flags) || |
| @@ -379,9 +409,6 @@ static int validate_raid_redundancy(struct raid_set *rs) | |||
| 379 | * as long as the failed devices occur in different mirror | 409 | * as long as the failed devices occur in different mirror |
| 380 | * groups (i.e. different stripes). | 410 | * groups (i.e. different stripes). |
| 381 | * | 411 | * |
| 382 | * Right now, we only allow for "near" copies. When other | ||
| 383 | * formats are added, we will have to check those too. | ||
| 384 | * | ||
| 385 | * When checking "near" format, make sure no adjacent devices | 412 | * When checking "near" format, make sure no adjacent devices |
| 386 | * have failed beyond what can be handled. In addition to the | 413 | * have failed beyond what can be handled. In addition to the |
| 387 | * simple case where the number of devices is a multiple of the | 414 | * simple case where the number of devices is a multiple of the |
| @@ -391,14 +418,41 @@ static int validate_raid_redundancy(struct raid_set *rs) | |||
| 391 | * A A B B C | 418 | * A A B B C |
| 392 | * C D D E E | 419 | * C D D E E |
| 393 | */ | 420 | */ |
| 394 | for (i = 0; i < rs->md.raid_disks * copies; i++) { | 421 | if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) { |
| 395 | if (!(i % copies)) | 422 | for (i = 0; i < rs->md.raid_disks * copies; i++) { |
| 423 | if (!(i % copies)) | ||
| 424 | rebuilds_per_group = 0; | ||
| 425 | d = i % rs->md.raid_disks; | ||
| 426 | if ((!rs->dev[d].rdev.sb_page || | ||
| 427 | !test_bit(In_sync, &rs->dev[d].rdev.flags)) && | ||
| 428 | (++rebuilds_per_group >= copies)) | ||
| 429 | goto too_many; | ||
| 430 | } | ||
| 431 | break; | ||
| 432 | } | ||
| 433 | |||
| 434 | /* | ||
| 435 | * When checking "far" and "offset" formats, we need to ensure | ||
| 436 | * that the device that holds its copy is not also dead or | ||
| 437 | * being rebuilt. (Note that "far" and "offset" formats only | ||
| 438 | * support two copies right now. These formats also only ever | ||
| 439 | * use the 'use_far_sets' variant.) | ||
| 440 | * | ||
| 441 | * This check is somewhat complicated by the need to account | ||
| 442 | * for arrays that are not a multiple of (far) copies. This | ||
| 443 | * results in the need to treat the last (potentially larger) | ||
| 444 | * set differently. | ||
| 445 | */ | ||
| 446 | group_size = (rs->md.raid_disks / copies); | ||
| 447 | last_group_start = (rs->md.raid_disks / group_size) - 1; | ||
| 448 | last_group_start *= group_size; | ||
| 449 | for (i = 0; i < rs->md.raid_disks; i++) { | ||
| 450 | if (!(i % copies) && !(i > last_group_start)) | ||
| 396 | rebuilds_per_group = 0; | 451 | rebuilds_per_group = 0; |
| 397 | d = i % rs->md.raid_disks; | 452 | if ((!rs->dev[i].rdev.sb_page || |
| 398 | if ((!rs->dev[d].rdev.sb_page || | 453 | !test_bit(In_sync, &rs->dev[i].rdev.flags)) && |
| 399 | !test_bit(In_sync, &rs->dev[d].rdev.flags)) && | ||
| 400 | (++rebuilds_per_group >= copies)) | 454 | (++rebuilds_per_group >= copies)) |
| 401 | goto too_many; | 455 | goto too_many; |
| 402 | } | 456 | } |
| 403 | break; | 457 | break; |
| 404 | default: | 458 | default: |
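
A worked instance of the group arithmetic above (illustrative numbers; copies = 2 is the only value these formats support): with 5 raid disks, group_size = 5 / 2 = 2 and last_group_start = ((5 / 2) - 1) * 2 = 2. The rebuild counter therefore resets at devices 0 and 2 but not at device 4, so devices 2-4 are counted as one larger final set, exactly the "last (potentially larger) set" the comment describes; two failures anywhere in that tail trip the too_many path.
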
| @@ -433,7 +487,7 @@ too_many: | |||
| 433 | * | 487 | * |
| 434 | * RAID10-only options: | 488 | * RAID10-only options: |
| 435 | * [raid10_copies <# copies>] Number of copies. (Default: 2) | 489 | * [raid10_copies <# copies>] Number of copies. (Default: 2) |
| 436 | * [raid10_format <near>] Layout algorithm. (Default: near) | 490 | * [raid10_format <near|far|offset>] Layout algorithm. (Default: near) |
| 437 | */ | 491 | */ |
| 438 | static int parse_raid_params(struct raid_set *rs, char **argv, | 492 | static int parse_raid_params(struct raid_set *rs, char **argv, |
| 439 | unsigned num_raid_params) | 493 | unsigned num_raid_params) |
| @@ -520,7 +574,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv, | |||
| 520 | rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type"; | 574 | rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type"; |
| 521 | return -EINVAL; | 575 | return -EINVAL; |
| 522 | } | 576 | } |
| 523 | if (strcmp("near", argv[i])) { | 577 | if (strcmp("near", argv[i]) && |
| 578 | strcmp("far", argv[i]) && | ||
| 579 | strcmp("offset", argv[i])) { | ||
| 524 | rs->ti->error = "Invalid 'raid10_format' value given"; | 580 | rs->ti->error = "Invalid 'raid10_format' value given"; |
| 525 | return -EINVAL; | 581 | return -EINVAL; |
| 526 | } | 582 | } |
| @@ -644,6 +700,15 @@ static int parse_raid_params(struct raid_set *rs, char **argv, | |||
| 644 | return -EINVAL; | 700 | return -EINVAL; |
| 645 | } | 701 | } |
| 646 | 702 | ||
| 703 | /* | ||
| 704 | * If the format is not "near", we only support | ||
| 705 | * two copies at the moment. | ||
| 706 | */ | ||
| 707 | if (strcmp("near", raid10_format) && (raid10_copies > 2)) { | ||
| 708 | rs->ti->error = "Too many copies for given RAID10 format."; | ||
| 709 | return -EINVAL; | ||
| 710 | } | ||
| 711 | |||
| 647 | /* (Len * #mirrors) / #devices */ | 712 | /* (Len * #mirrors) / #devices */ |
| 648 | sectors_per_dev = rs->ti->len * raid10_copies; | 713 | sectors_per_dev = rs->ti->len * raid10_copies; |
| 649 | sector_div(sectors_per_dev, rs->md.raid_disks); | 714 | sector_div(sectors_per_dev, rs->md.raid_disks); |
| @@ -854,17 +919,30 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) | |||
| 854 | /* | 919 | /* |
| 855 | * Reshaping is not currently allowed | 920 | * Reshaping is not currently allowed |
| 856 | */ | 921 | */ |
| 857 | if ((le32_to_cpu(sb->level) != mddev->level) || | 922 | if (le32_to_cpu(sb->level) != mddev->level) { |
| 858 | (le32_to_cpu(sb->layout) != mddev->layout) || | 923 | DMERR("Reshaping arrays not yet supported. (RAID level change)"); |
| 859 | (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) { | 924 | return -EINVAL; |
| 860 | DMERR("Reshaping arrays not yet supported."); | 925 | } |
| 926 | if (le32_to_cpu(sb->layout) != mddev->layout) { | ||
| 927 | DMERR("Reshaping arrays not yet supported. (RAID layout change)"); | ||
| 928 | DMERR(" 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout); | ||
| 929 | DMERR(" Old layout: %s w/ %d copies", | ||
| 930 | raid10_md_layout_to_format(le32_to_cpu(sb->layout)), | ||
| 931 | raid10_md_layout_to_copies(le32_to_cpu(sb->layout))); | ||
| 932 | DMERR(" New layout: %s w/ %d copies", | ||
| 933 | raid10_md_layout_to_format(mddev->layout), | ||
| 934 | raid10_md_layout_to_copies(mddev->layout)); | ||
| 935 | return -EINVAL; | ||
| 936 | } | ||
| 937 | if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) { | ||
| 938 | DMERR("Reshaping arrays not yet supported. (stripe sectors change)"); | ||
| 861 | return -EINVAL; | 939 | return -EINVAL; |
| 862 | } | 940 | } |
| 863 | 941 | ||
| 864 | /* We can only change the number of devices in RAID1 right now */ | 942 | /* We can only change the number of devices in RAID1 right now */ |
| 865 | if ((rs->raid_type->level != 1) && | 943 | if ((rs->raid_type->level != 1) && |
| 866 | (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { | 944 | (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { |
| 867 | DMERR("Reshaping arrays not yet supported."); | 945 | DMERR("Reshaping arrays not yet supported. (device count change)"); |
| 868 | return -EINVAL; | 946 | return -EINVAL; |
| 869 | } | 947 | } |
| 870 | 948 | ||
| @@ -1151,7 +1229,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
| 1151 | 1229 | ||
| 1152 | INIT_WORK(&rs->md.event_work, do_table_event); | 1230 | INIT_WORK(&rs->md.event_work, do_table_event); |
| 1153 | ti->private = rs; | 1231 | ti->private = rs; |
| 1154 | ti->num_flush_requests = 1; | 1232 | ti->num_flush_bios = 1; |
| 1155 | 1233 | ||
| 1156 | mutex_lock(&rs->md.reconfig_mutex); | 1234 | mutex_lock(&rs->md.reconfig_mutex); |
| 1157 | ret = md_run(&rs->md); | 1235 | ret = md_run(&rs->md); |
| @@ -1201,8 +1279,8 @@ static int raid_map(struct dm_target *ti, struct bio *bio) | |||
| 1201 | return DM_MAPIO_SUBMITTED; | 1279 | return DM_MAPIO_SUBMITTED; |
| 1202 | } | 1280 | } |
| 1203 | 1281 | ||
| 1204 | static int raid_status(struct dm_target *ti, status_type_t type, | 1282 | static void raid_status(struct dm_target *ti, status_type_t type, |
| 1205 | unsigned status_flags, char *result, unsigned maxlen) | 1283 | unsigned status_flags, char *result, unsigned maxlen) |
| 1206 | { | 1284 | { |
| 1207 | struct raid_set *rs = ti->private; | 1285 | struct raid_set *rs = ti->private; |
| 1208 | unsigned raid_param_cnt = 1; /* at least 1 for chunksize */ | 1286 | unsigned raid_param_cnt = 1; /* at least 1 for chunksize */ |
| @@ -1329,7 +1407,8 @@ static int raid_status(struct dm_target *ti, status_type_t type, | |||
| 1329 | raid10_md_layout_to_copies(rs->md.layout)); | 1407 | raid10_md_layout_to_copies(rs->md.layout)); |
| 1330 | 1408 | ||
| 1331 | if (rs->print_flags & DMPF_RAID10_FORMAT) | 1409 | if (rs->print_flags & DMPF_RAID10_FORMAT) |
| 1332 | DMEMIT(" raid10_format near"); | 1410 | DMEMIT(" raid10_format %s", |
| 1411 | raid10_md_layout_to_format(rs->md.layout)); | ||
| 1333 | 1412 | ||
| 1334 | DMEMIT(" %d", rs->md.raid_disks); | 1413 | DMEMIT(" %d", rs->md.raid_disks); |
| 1335 | for (i = 0; i < rs->md.raid_disks; i++) { | 1414 | for (i = 0; i < rs->md.raid_disks; i++) { |
| @@ -1344,8 +1423,6 @@ static int raid_status(struct dm_target *ti, status_type_t type, | |||
| 1344 | DMEMIT(" -"); | 1423 | DMEMIT(" -"); |
| 1345 | } | 1424 | } |
| 1346 | } | 1425 | } |
| 1347 | |||
| 1348 | return 0; | ||
| 1349 | } | 1426 | } |
| 1350 | 1427 | ||
| 1351 | static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) | 1428 | static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) |
| @@ -1405,7 +1482,7 @@ static void raid_resume(struct dm_target *ti) | |||
| 1405 | 1482 | ||
| 1406 | static struct target_type raid_target = { | 1483 | static struct target_type raid_target = { |
| 1407 | .name = "raid", | 1484 | .name = "raid", |
| 1408 | .version = {1, 4, 1}, | 1485 | .version = {1, 4, 2}, |
| 1409 | .module = THIS_MODULE, | 1486 | .module = THIS_MODULE, |
| 1410 | .ctr = raid_ctr, | 1487 | .ctr = raid_ctr, |
| 1411 | .dtr = raid_dtr, | 1488 | .dtr = raid_dtr, |
| @@ -1420,6 +1497,10 @@ static struct target_type raid_target = { | |||
| 1420 | 1497 | ||
| 1421 | static int __init dm_raid_init(void) | 1498 | static int __init dm_raid_init(void) |
| 1422 | { | 1499 | { |
| 1500 | DMINFO("Loading target version %u.%u.%u", | ||
| 1501 | raid_target.version[0], | ||
| 1502 | raid_target.version[1], | ||
| 1503 | raid_target.version[2]); | ||
| 1423 | return dm_register_target(&raid_target); | 1504 | return dm_register_target(&raid_target); |
| 1424 | } | 1505 | } |
| 1425 | 1506 | ||
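
The raid10 helpers added above pack the format into MD's layout word: near copies in bits 0-7, far copies in bits 8-15, bit 16 for "offset" and bit 17 for "use_far_sets". A freestanding round-trip sketch of that encoding, written as userspace C purely for illustration (the kernel versions are the ones in the hunk):

#include <stdio.h>
#include <string.h>

static int format_to_layout(const char *format, unsigned copies)
{
	unsigned n = 1, f = 1;

	if (!strcmp("near", format))
		n = copies;
	else
		f = copies;

	if (!strcmp("offset", format))
		return 0x30000 | (f << 8) | n;	/* use_far_sets + offset */
	if (!strcmp("far", format))
		return 0x20000 | (f << 8) | n;	/* use_far_sets only */
	return (f << 8) | n;			/* near */
}

static const char *layout_to_format(int layout)
{
	if ((layout & 0x10000) && (layout & 0x20000))
		return "offset";
	if ((layout & 0xFF) > 1)
		return "near";
	return "far";
}

int main(void)
{
	int layout = format_to_layout("offset", 2);

	/* prints "0x30201 offset": f = 2, n = 1, bits 16 and 17 set */
	printf("0x%X %s\n", layout, layout_to_format(layout));
	return 0;
}
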
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index fa519185ebba..d053098c6a91 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c | |||
| @@ -82,6 +82,9 @@ struct mirror_set { | |||
| 82 | struct mirror mirror[0]; | 82 | struct mirror mirror[0]; |
| 83 | }; | 83 | }; |
| 84 | 84 | ||
| 85 | DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(raid1_resync_throttle, | ||
| 86 | "A percentage of time allocated for raid resynchronization"); | ||
| 87 | |||
| 85 | static void wakeup_mirrord(void *context) | 88 | static void wakeup_mirrord(void *context) |
| 86 | { | 89 | { |
| 87 | struct mirror_set *ms = context; | 90 | struct mirror_set *ms = context; |
| @@ -1072,8 +1075,8 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 1072 | if (r) | 1075 | if (r) |
| 1073 | goto err_free_context; | 1076 | goto err_free_context; |
| 1074 | 1077 | ||
| 1075 | ti->num_flush_requests = 1; | 1078 | ti->num_flush_bios = 1; |
| 1076 | ti->num_discard_requests = 1; | 1079 | ti->num_discard_bios = 1; |
| 1077 | ti->per_bio_data_size = sizeof(struct dm_raid1_bio_record); | 1080 | ti->per_bio_data_size = sizeof(struct dm_raid1_bio_record); |
| 1078 | ti->discard_zeroes_data_unsupported = true; | 1081 | ti->discard_zeroes_data_unsupported = true; |
| 1079 | 1082 | ||
| @@ -1111,7 +1114,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 1111 | goto err_destroy_wq; | 1114 | goto err_destroy_wq; |
| 1112 | } | 1115 | } |
| 1113 | 1116 | ||
| 1114 | ms->kcopyd_client = dm_kcopyd_client_create(); | 1117 | ms->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle); |
| 1115 | if (IS_ERR(ms->kcopyd_client)) { | 1118 | if (IS_ERR(ms->kcopyd_client)) { |
| 1116 | r = PTR_ERR(ms->kcopyd_client); | 1119 | r = PTR_ERR(ms->kcopyd_client); |
| 1117 | goto err_destroy_wq; | 1120 | goto err_destroy_wq; |
| @@ -1347,8 +1350,8 @@ static char device_status_char(struct mirror *m) | |||
| 1347 | } | 1350 | } |
| 1348 | 1351 | ||
| 1349 | 1352 | ||
| 1350 | static int mirror_status(struct dm_target *ti, status_type_t type, | 1353 | static void mirror_status(struct dm_target *ti, status_type_t type, |
| 1351 | unsigned status_flags, char *result, unsigned maxlen) | 1354 | unsigned status_flags, char *result, unsigned maxlen) |
| 1352 | { | 1355 | { |
| 1353 | unsigned int m, sz = 0; | 1356 | unsigned int m, sz = 0; |
| 1354 | struct mirror_set *ms = (struct mirror_set *) ti->private; | 1357 | struct mirror_set *ms = (struct mirror_set *) ti->private; |
| @@ -1383,8 +1386,6 @@ static int mirror_status(struct dm_target *ti, status_type_t type, | |||
| 1383 | if (ms->features & DM_RAID1_HANDLE_ERRORS) | 1386 | if (ms->features & DM_RAID1_HANDLE_ERRORS) |
| 1384 | DMEMIT(" 1 handle_errors"); | 1387 | DMEMIT(" 1 handle_errors"); |
| 1385 | } | 1388 | } |
| 1386 | |||
| 1387 | return 0; | ||
| 1388 | } | 1389 | } |
| 1389 | 1390 | ||
| 1390 | static int mirror_iterate_devices(struct dm_target *ti, | 1391 | static int mirror_iterate_devices(struct dm_target *ti, |
| @@ -1403,7 +1404,7 @@ static int mirror_iterate_devices(struct dm_target *ti, | |||
| 1403 | 1404 | ||
| 1404 | static struct target_type mirror_target = { | 1405 | static struct target_type mirror_target = { |
| 1405 | .name = "mirror", | 1406 | .name = "mirror", |
| 1406 | .version = {1, 13, 1}, | 1407 | .version = {1, 13, 2}, |
| 1407 | .module = THIS_MODULE, | 1408 | .module = THIS_MODULE, |
| 1408 | .ctr = mirror_ctr, | 1409 | .ctr = mirror_ctr, |
| 1409 | .dtr = mirror_dtr, | 1410 | .dtr = mirror_dtr, |
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 59fc18ae52c2..c0e07026a8d1 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c | |||
| @@ -124,6 +124,9 @@ struct dm_snapshot { | |||
| 124 | #define RUNNING_MERGE 0 | 124 | #define RUNNING_MERGE 0 |
| 125 | #define SHUTDOWN_MERGE 1 | 125 | #define SHUTDOWN_MERGE 1 |
| 126 | 126 | ||
| 127 | DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle, | ||
| 128 | "A percentage of time allocated for copy on write"); | ||
| 129 | |||
| 127 | struct dm_dev *dm_snap_origin(struct dm_snapshot *s) | 130 | struct dm_dev *dm_snap_origin(struct dm_snapshot *s) |
| 128 | { | 131 | { |
| 129 | return s->origin; | 132 | return s->origin; |
| @@ -227,12 +230,11 @@ static void stop_tracking_chunk(struct dm_snapshot *s, struct bio *bio) | |||
| 227 | static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk) | 230 | static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk) |
| 228 | { | 231 | { |
| 229 | struct dm_snap_tracked_chunk *c; | 232 | struct dm_snap_tracked_chunk *c; |
| 230 | struct hlist_node *hn; | ||
| 231 | int found = 0; | 233 | int found = 0; |
| 232 | 234 | ||
| 233 | spin_lock_irq(&s->tracked_chunk_lock); | 235 | spin_lock_irq(&s->tracked_chunk_lock); |
| 234 | 236 | ||
| 235 | hlist_for_each_entry(c, hn, | 237 | hlist_for_each_entry(c, |
| 236 | &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) { | 238 | &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) { |
| 237 | if (c->chunk == chunk) { | 239 | if (c->chunk == chunk) { |
| 238 | found = 1; | 240 | found = 1; |
| @@ -1038,7 +1040,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 1038 | int i; | 1040 | int i; |
| 1039 | int r = -EINVAL; | 1041 | int r = -EINVAL; |
| 1040 | char *origin_path, *cow_path; | 1042 | char *origin_path, *cow_path; |
| 1041 | unsigned args_used, num_flush_requests = 1; | 1043 | unsigned args_used, num_flush_bios = 1; |
| 1042 | fmode_t origin_mode = FMODE_READ; | 1044 | fmode_t origin_mode = FMODE_READ; |
| 1043 | 1045 | ||
| 1044 | if (argc != 4) { | 1046 | if (argc != 4) { |
| @@ -1048,7 +1050,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 1048 | } | 1050 | } |
| 1049 | 1051 | ||
| 1050 | if (dm_target_is_snapshot_merge(ti)) { | 1052 | if (dm_target_is_snapshot_merge(ti)) { |
| 1051 | num_flush_requests = 2; | 1053 | num_flush_bios = 2; |
| 1052 | origin_mode = FMODE_WRITE; | 1054 | origin_mode = FMODE_WRITE; |
| 1053 | } | 1055 | } |
| 1054 | 1056 | ||
| @@ -1109,7 +1111,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 1109 | goto bad_hash_tables; | 1111 | goto bad_hash_tables; |
| 1110 | } | 1112 | } |
| 1111 | 1113 | ||
| 1112 | s->kcopyd_client = dm_kcopyd_client_create(); | 1114 | s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle); |
| 1113 | if (IS_ERR(s->kcopyd_client)) { | 1115 | if (IS_ERR(s->kcopyd_client)) { |
| 1114 | r = PTR_ERR(s->kcopyd_client); | 1116 | r = PTR_ERR(s->kcopyd_client); |
| 1115 | ti->error = "Could not create kcopyd client"; | 1117 | ti->error = "Could not create kcopyd client"; |
| @@ -1128,7 +1130,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 1128 | spin_lock_init(&s->tracked_chunk_lock); | 1130 | spin_lock_init(&s->tracked_chunk_lock); |
| 1129 | 1131 | ||
| 1130 | ti->private = s; | 1132 | ti->private = s; |
| 1131 | ti->num_flush_requests = num_flush_requests; | 1133 | ti->num_flush_bios = num_flush_bios; |
| 1132 | ti->per_bio_data_size = sizeof(struct dm_snap_tracked_chunk); | 1134 | ti->per_bio_data_size = sizeof(struct dm_snap_tracked_chunk); |
| 1133 | 1135 | ||
| 1134 | /* Add snapshot to the list of snapshots for this origin */ | 1136 | /* Add snapshot to the list of snapshots for this origin */ |
| @@ -1692,7 +1694,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) | |||
| 1692 | init_tracked_chunk(bio); | 1694 | init_tracked_chunk(bio); |
| 1693 | 1695 | ||
| 1694 | if (bio->bi_rw & REQ_FLUSH) { | 1696 | if (bio->bi_rw & REQ_FLUSH) { |
| 1695 | if (!dm_bio_get_target_request_nr(bio)) | 1697 | if (!dm_bio_get_target_bio_nr(bio)) |
| 1696 | bio->bi_bdev = s->origin->bdev; | 1698 | bio->bi_bdev = s->origin->bdev; |
| 1697 | else | 1699 | else |
| 1698 | bio->bi_bdev = s->cow->bdev; | 1700 | bio->bi_bdev = s->cow->bdev; |
| @@ -1837,8 +1839,8 @@ static void snapshot_merge_resume(struct dm_target *ti) | |||
| 1837 | start_merge(s); | 1839 | start_merge(s); |
| 1838 | } | 1840 | } |
| 1839 | 1841 | ||
| 1840 | static int snapshot_status(struct dm_target *ti, status_type_t type, | 1842 | static void snapshot_status(struct dm_target *ti, status_type_t type, |
| 1841 | unsigned status_flags, char *result, unsigned maxlen) | 1843 | unsigned status_flags, char *result, unsigned maxlen) |
| 1842 | { | 1844 | { |
| 1843 | unsigned sz = 0; | 1845 | unsigned sz = 0; |
| 1844 | struct dm_snapshot *snap = ti->private; | 1846 | struct dm_snapshot *snap = ti->private; |
| @@ -1884,8 +1886,6 @@ static int snapshot_status(struct dm_target *ti, status_type_t type, | |||
| 1884 | maxlen - sz); | 1886 | maxlen - sz); |
| 1885 | break; | 1887 | break; |
| 1886 | } | 1888 | } |
| 1887 | |||
| 1888 | return 0; | ||
| 1889 | } | 1889 | } |
| 1890 | 1890 | ||
| 1891 | static int snapshot_iterate_devices(struct dm_target *ti, | 1891 | static int snapshot_iterate_devices(struct dm_target *ti, |
| @@ -2105,7 +2105,7 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 2105 | } | 2105 | } |
| 2106 | 2106 | ||
| 2107 | ti->private = dev; | 2107 | ti->private = dev; |
| 2108 | ti->num_flush_requests = 1; | 2108 | ti->num_flush_bios = 1; |
| 2109 | 2109 | ||
| 2110 | return 0; | 2110 | return 0; |
| 2111 | } | 2111 | } |
| @@ -2139,8 +2139,8 @@ static void origin_resume(struct dm_target *ti) | |||
| 2139 | ti->max_io_len = get_origin_minimum_chunksize(dev->bdev); | 2139 | ti->max_io_len = get_origin_minimum_chunksize(dev->bdev); |
| 2140 | } | 2140 | } |
| 2141 | 2141 | ||
| 2142 | static int origin_status(struct dm_target *ti, status_type_t type, | 2142 | static void origin_status(struct dm_target *ti, status_type_t type, |
| 2143 | unsigned status_flags, char *result, unsigned maxlen) | 2143 | unsigned status_flags, char *result, unsigned maxlen) |
| 2144 | { | 2144 | { |
| 2145 | struct dm_dev *dev = ti->private; | 2145 | struct dm_dev *dev = ti->private; |
| 2146 | 2146 | ||
| @@ -2153,8 +2153,6 @@ static int origin_status(struct dm_target *ti, status_type_t type, | |||
| 2153 | snprintf(result, maxlen, "%s", dev->name); | 2153 | snprintf(result, maxlen, "%s", dev->name); |
| 2154 | break; | 2154 | break; |
| 2155 | } | 2155 | } |
| 2156 | |||
| 2157 | return 0; | ||
| 2158 | } | 2156 | } |
| 2159 | 2157 | ||
| 2160 | static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | 2158 | static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, |
| @@ -2181,7 +2179,7 @@ static int origin_iterate_devices(struct dm_target *ti, | |||
| 2181 | 2179 | ||
| 2182 | static struct target_type origin_target = { | 2180 | static struct target_type origin_target = { |
| 2183 | .name = "snapshot-origin", | 2181 | .name = "snapshot-origin", |
| 2184 | .version = {1, 8, 0}, | 2182 | .version = {1, 8, 1}, |
| 2185 | .module = THIS_MODULE, | 2183 | .module = THIS_MODULE, |
| 2186 | .ctr = origin_ctr, | 2184 | .ctr = origin_ctr, |
| 2187 | .dtr = origin_dtr, | 2185 | .dtr = origin_dtr, |
| @@ -2194,7 +2192,7 @@ static struct target_type origin_target = { | |||
| 2194 | 2192 | ||
| 2195 | static struct target_type snapshot_target = { | 2193 | static struct target_type snapshot_target = { |
| 2196 | .name = "snapshot", | 2194 | .name = "snapshot", |
| 2197 | .version = {1, 11, 0}, | 2195 | .version = {1, 11, 1}, |
| 2198 | .module = THIS_MODULE, | 2196 | .module = THIS_MODULE, |
| 2199 | .ctr = snapshot_ctr, | 2197 | .ctr = snapshot_ctr, |
| 2200 | .dtr = snapshot_dtr, | 2198 | .dtr = snapshot_dtr, |
| @@ -2307,3 +2305,5 @@ module_exit(dm_snapshot_exit); | |||
| 2307 | MODULE_DESCRIPTION(DM_NAME " snapshot target"); | 2305 | MODULE_DESCRIPTION(DM_NAME " snapshot target"); |
| 2308 | MODULE_AUTHOR("Joe Thornber"); | 2306 | MODULE_AUTHOR("Joe Thornber"); |
| 2309 | MODULE_LICENSE("GPL"); | 2307 | MODULE_LICENSE("GPL"); |
| 2308 | MODULE_ALIAS("dm-snapshot-origin"); | ||
| 2309 | MODULE_ALIAS("dm-snapshot-merge"); | ||
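
The dropped struct hlist_node *hn in __chunk_is_tracked() tracks the tree-wide hlist change in this kernel cycle: hlist_for_each_entry() now derives its cursor from the entry itself rather than taking a separate node argument. Schematically (illustrative shapes, not a buildable excerpt):

/* old iterator shape */
struct hlist_node *hn;
hlist_for_each_entry(c, hn, head, member)
	do_something(c);

/* new iterator shape: no separate cursor variable */
hlist_for_each_entry(c, head, member)
	do_something(c);
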
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index c89cde86d400..d8837d313f54 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c | |||
| @@ -160,9 +160,9 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 160 | if (r) | 160 | if (r) |
| 161 | return r; | 161 | return r; |
| 162 | 162 | ||
| 163 | ti->num_flush_requests = stripes; | 163 | ti->num_flush_bios = stripes; |
| 164 | ti->num_discard_requests = stripes; | 164 | ti->num_discard_bios = stripes; |
| 165 | ti->num_write_same_requests = stripes; | 165 | ti->num_write_same_bios = stripes; |
| 166 | 166 | ||
| 167 | sc->chunk_size = chunk_size; | 167 | sc->chunk_size = chunk_size; |
| 168 | if (chunk_size & (chunk_size - 1)) | 168 | if (chunk_size & (chunk_size - 1)) |
| @@ -276,19 +276,19 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) | |||
| 276 | { | 276 | { |
| 277 | struct stripe_c *sc = ti->private; | 277 | struct stripe_c *sc = ti->private; |
| 278 | uint32_t stripe; | 278 | uint32_t stripe; |
| 279 | unsigned target_request_nr; | 279 | unsigned target_bio_nr; |
| 280 | 280 | ||
| 281 | if (bio->bi_rw & REQ_FLUSH) { | 281 | if (bio->bi_rw & REQ_FLUSH) { |
| 282 | target_request_nr = dm_bio_get_target_request_nr(bio); | 282 | target_bio_nr = dm_bio_get_target_bio_nr(bio); |
| 283 | BUG_ON(target_request_nr >= sc->stripes); | 283 | BUG_ON(target_bio_nr >= sc->stripes); |
| 284 | bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev; | 284 | bio->bi_bdev = sc->stripe[target_bio_nr].dev->bdev; |
| 285 | return DM_MAPIO_REMAPPED; | 285 | return DM_MAPIO_REMAPPED; |
| 286 | } | 286 | } |
| 287 | if (unlikely(bio->bi_rw & REQ_DISCARD) || | 287 | if (unlikely(bio->bi_rw & REQ_DISCARD) || |
| 288 | unlikely(bio->bi_rw & REQ_WRITE_SAME)) { | 288 | unlikely(bio->bi_rw & REQ_WRITE_SAME)) { |
| 289 | target_request_nr = dm_bio_get_target_request_nr(bio); | 289 | target_bio_nr = dm_bio_get_target_bio_nr(bio); |
| 290 | BUG_ON(target_request_nr >= sc->stripes); | 290 | BUG_ON(target_bio_nr >= sc->stripes); |
| 291 | return stripe_map_range(sc, bio, target_request_nr); | 291 | return stripe_map_range(sc, bio, target_bio_nr); |
| 292 | } | 292 | } |
| 293 | 293 | ||
| 294 | stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector); | 294 | stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector); |
| @@ -312,8 +312,8 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) | |||
| 312 | * | 312 | * |
| 313 | */ | 313 | */ |
| 314 | 314 | ||
| 315 | static int stripe_status(struct dm_target *ti, status_type_t type, | 315 | static void stripe_status(struct dm_target *ti, status_type_t type, |
| 316 | unsigned status_flags, char *result, unsigned maxlen) | 316 | unsigned status_flags, char *result, unsigned maxlen) |
| 317 | { | 317 | { |
| 318 | struct stripe_c *sc = (struct stripe_c *) ti->private; | 318 | struct stripe_c *sc = (struct stripe_c *) ti->private; |
| 319 | char buffer[sc->stripes + 1]; | 319 | char buffer[sc->stripes + 1]; |
| @@ -340,7 +340,6 @@ static int stripe_status(struct dm_target *ti, status_type_t type, | |||
| 340 | (unsigned long long)sc->stripe[i].physical_start); | 340 | (unsigned long long)sc->stripe[i].physical_start); |
| 341 | break; | 341 | break; |
| 342 | } | 342 | } |
| 343 | return 0; | ||
| 344 | } | 343 | } |
| 345 | 344 | ||
| 346 | static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error) | 345 | static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error) |
| @@ -428,7 +427,7 @@ static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | |||
| 428 | 427 | ||
| 429 | static struct target_type stripe_target = { | 428 | static struct target_type stripe_target = { |
| 430 | .name = "striped", | 429 | .name = "striped", |
| 431 | .version = {1, 5, 0}, | 430 | .version = {1, 5, 1}, |
| 432 | .module = THIS_MODULE, | 431 | .module = THIS_MODULE, |
| 433 | .ctr = stripe_ctr, | 432 | .ctr = stripe_ctr, |
| 434 | .dtr = stripe_dtr, | 433 | .dtr = stripe_dtr, |
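
dm-stripe also shows the bio-number side of the renaming: the core clones one flush bio per ti->num_flush_bios, and dm_bio_get_target_bio_nr() (formerly dm_bio_get_target_request_nr()) tells the map method which clone it holds, which is how each flush reaches its own stripe device. A minimal constructor sketch using the renamed fields; the target itself is assumed:

static int example_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	/*
	 * Formerly num_flush_requests / num_discard_requests /
	 * num_write_same_requests. The meaning is unchanged: how many
	 * bio clones dm core issues per flush/discard/WRITE SAME.
	 */
	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->num_write_same_bios = 1;
	return 0;
}
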
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index daf25d0890b3..e50dad0c65f4 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
| @@ -217,7 +217,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode, | |||
| 217 | 217 | ||
| 218 | if (alloc_targets(t, num_targets)) { | 218 | if (alloc_targets(t, num_targets)) { |
| 219 | kfree(t); | 219 | kfree(t); |
| 220 | t = NULL; | ||
| 221 | return -ENOMEM; | 220 | return -ENOMEM; |
| 222 | } | 221 | } |
| 223 | 222 | ||
| @@ -823,8 +822,8 @@ int dm_table_add_target(struct dm_table *t, const char *type, | |||
| 823 | 822 | ||
| 824 | t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; | 823 | t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; |
| 825 | 824 | ||
| 826 | if (!tgt->num_discard_requests && tgt->discards_supported) | 825 | if (!tgt->num_discard_bios && tgt->discards_supported) |
| 827 | DMWARN("%s: %s: ignoring discards_supported because num_discard_requests is zero.", | 826 | DMWARN("%s: %s: ignoring discards_supported because num_discard_bios is zero.", |
| 828 | dm_device_name(t->md), type); | 827 | dm_device_name(t->md), type); |
| 829 | 828 | ||
| 830 | return 0; | 829 | return 0; |
| @@ -1360,7 +1359,7 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush) | |||
| 1360 | while (i < dm_table_get_num_targets(t)) { | 1359 | while (i < dm_table_get_num_targets(t)) { |
| 1361 | ti = dm_table_get_target(t, i++); | 1360 | ti = dm_table_get_target(t, i++); |
| 1362 | 1361 | ||
| 1363 | if (!ti->num_flush_requests) | 1362 | if (!ti->num_flush_bios) |
| 1364 | continue; | 1363 | continue; |
| 1365 | 1364 | ||
| 1366 | if (ti->flush_supported) | 1365 | if (ti->flush_supported) |
| @@ -1439,7 +1438,7 @@ static bool dm_table_supports_write_same(struct dm_table *t) | |||
| 1439 | while (i < dm_table_get_num_targets(t)) { | 1438 | while (i < dm_table_get_num_targets(t)) { |
| 1440 | ti = dm_table_get_target(t, i++); | 1439 | ti = dm_table_get_target(t, i++); |
| 1441 | 1440 | ||
| 1442 | if (!ti->num_write_same_requests) | 1441 | if (!ti->num_write_same_bios) |
| 1443 | return false; | 1442 | return false; |
| 1444 | 1443 | ||
| 1445 | if (!ti->type->iterate_devices || | 1444 | if (!ti->type->iterate_devices || |
| @@ -1657,7 +1656,7 @@ bool dm_table_supports_discards(struct dm_table *t) | |||
| 1657 | while (i < dm_table_get_num_targets(t)) { | 1656 | while (i < dm_table_get_num_targets(t)) { |
| 1658 | ti = dm_table_get_target(t, i++); | 1657 | ti = dm_table_get_target(t, i++); |
| 1659 | 1658 | ||
| 1660 | if (!ti->num_discard_requests) | 1659 | if (!ti->num_discard_bios) |
| 1661 | continue; | 1660 | continue; |
| 1662 | 1661 | ||
| 1663 | if (ti->discards_supported) | 1662 | if (ti->discards_supported) |
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 617d21a77256..37ba5db71cd9 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c | |||
| @@ -116,7 +116,7 @@ static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args) | |||
| 116 | /* | 116 | /* |
| 117 | * Return error for discards instead of -EOPNOTSUPP | 117 | * Return error for discards instead of -EOPNOTSUPP |
| 118 | */ | 118 | */ |
| 119 | tt->num_discard_requests = 1; | 119 | tt->num_discard_bios = 1; |
| 120 | 120 | ||
| 121 | return 0; | 121 | return 0; |
| 122 | } | 122 | } |
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 4d6e85367b84..00cee02f8fc9 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c | |||
| @@ -280,7 +280,7 @@ static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t) | |||
| 280 | *t = v & ((1 << 24) - 1); | 280 | *t = v & ((1 << 24) - 1); |
| 281 | } | 281 | } |
| 282 | 282 | ||
| 283 | static void data_block_inc(void *context, void *value_le) | 283 | static void data_block_inc(void *context, const void *value_le) |
| 284 | { | 284 | { |
| 285 | struct dm_space_map *sm = context; | 285 | struct dm_space_map *sm = context; |
| 286 | __le64 v_le; | 286 | __le64 v_le; |
| @@ -292,7 +292,7 @@ static void data_block_inc(void *context, void *value_le) | |||
| 292 | dm_sm_inc_block(sm, b); | 292 | dm_sm_inc_block(sm, b); |
| 293 | } | 293 | } |
| 294 | 294 | ||
| 295 | static void data_block_dec(void *context, void *value_le) | 295 | static void data_block_dec(void *context, const void *value_le) |
| 296 | { | 296 | { |
| 297 | struct dm_space_map *sm = context; | 297 | struct dm_space_map *sm = context; |
| 298 | __le64 v_le; | 298 | __le64 v_le; |
| @@ -304,7 +304,7 @@ static void data_block_dec(void *context, void *value_le) | |||
| 304 | dm_sm_dec_block(sm, b); | 304 | dm_sm_dec_block(sm, b); |
| 305 | } | 305 | } |
| 306 | 306 | ||
| 307 | static int data_block_equal(void *context, void *value1_le, void *value2_le) | 307 | static int data_block_equal(void *context, const void *value1_le, const void *value2_le) |
| 308 | { | 308 | { |
| 309 | __le64 v1_le, v2_le; | 309 | __le64 v1_le, v2_le; |
| 310 | uint64_t b1, b2; | 310 | uint64_t b1, b2; |
| @@ -318,7 +318,7 @@ static int data_block_equal(void *context, void *value1_le, void *value2_le) | |||
| 318 | return b1 == b2; | 318 | return b1 == b2; |
| 319 | } | 319 | } |
| 320 | 320 | ||
| 321 | static void subtree_inc(void *context, void *value) | 321 | static void subtree_inc(void *context, const void *value) |
| 322 | { | 322 | { |
| 323 | struct dm_btree_info *info = context; | 323 | struct dm_btree_info *info = context; |
| 324 | __le64 root_le; | 324 | __le64 root_le; |
| @@ -329,7 +329,7 @@ static void subtree_inc(void *context, void *value) | |||
| 329 | dm_tm_inc(info->tm, root); | 329 | dm_tm_inc(info->tm, root); |
| 330 | } | 330 | } |
| 331 | 331 | ||
| 332 | static void subtree_dec(void *context, void *value) | 332 | static void subtree_dec(void *context, const void *value) |
| 333 | { | 333 | { |
| 334 | struct dm_btree_info *info = context; | 334 | struct dm_btree_info *info = context; |
| 335 | __le64 root_le; | 335 | __le64 root_le; |
| @@ -341,7 +341,7 @@ static void subtree_dec(void *context, void *value) | |||
| 341 | DMERR("btree delete failed\n"); | 341 | DMERR("btree delete failed\n"); |
| 342 | } | 342 | } |
| 343 | 343 | ||
| 344 | static int subtree_equal(void *context, void *value1_le, void *value2_le) | 344 | static int subtree_equal(void *context, const void *value1_le, const void *value2_le) |
| 345 | { | 345 | { |
| 346 | __le64 v1_le, v2_le; | 346 | __le64 v1_le, v2_le; |
| 347 | memcpy(&v1_le, value1_le, sizeof(v1_le)); | 347 | memcpy(&v1_le, value1_le, sizeof(v1_le)); |
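
The dm-thin-metadata hunks constify the btree value_type callbacks: inc/dec/equal now take const pointers to the little-endian values, which may be unaligned, hence the memcpy before le64_to_cpu(). A sketch of the callback shape after the change; the name is illustrative, and the real data_block_inc() also unpacks a time field that this sketch omits:

static void example_block_inc(void *context, const void *value_le)
{
	struct dm_space_map *sm = context;
	__le64 v_le;
	uint64_t b;

	/* copy out of the (possibly unaligned) const btree value */
	memcpy(&v_le, value_le, sizeof(v_le));
	b = le64_to_cpu(v_le);
	dm_sm_inc_block(sm, b);
}
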
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 5409607d4875..009339d62828 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c | |||
| @@ -26,6 +26,9 @@ | |||
| 26 | #define PRISON_CELLS 1024 | 26 | #define PRISON_CELLS 1024 |
| 27 | #define COMMIT_PERIOD HZ | 27 | #define COMMIT_PERIOD HZ |
| 28 | 28 | ||
| 29 | DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle, | ||
| 30 | "A percentage of time allocated for copy on write"); | ||
| 31 | |||
| 29 | /* | 32 | /* |
| 30 | * The block size of the device holding pool data must be | 33 | * The block size of the device holding pool data must be |
| 31 | * between 64KB and 1GB. | 34 | * between 64KB and 1GB. |
| @@ -227,6 +230,78 @@ struct thin_c { | |||
| 227 | /*----------------------------------------------------------------*/ | 230 | /*----------------------------------------------------------------*/ |
| 228 | 231 | ||
| 229 | /* | 232 | /* |
| 233 | * wake_worker() is used when new work is queued and when pool_resume is | ||
| 234 | * ready to continue deferred IO processing. | ||
| 235 | */ | ||
| 236 | static void wake_worker(struct pool *pool) | ||
| 237 | { | ||
| 238 | queue_work(pool->wq, &pool->worker); | ||
| 239 | } | ||
| 240 | |||
| 241 | /*----------------------------------------------------------------*/ | ||
| 242 | |||
| 243 | static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio, | ||
| 244 | struct dm_bio_prison_cell **cell_result) | ||
| 245 | { | ||
| 246 | int r; | ||
| 247 | struct dm_bio_prison_cell *cell_prealloc; | ||
| 248 | |||
| 249 | /* | ||
| 250 | * Allocate a cell from the prison's mempool. | ||
| 251 | * This might block but it can't fail. | ||
| 252 | */ | ||
| 253 | cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO); | ||
| 254 | |||
| 255 | r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result); | ||
| 256 | if (r) | ||
| 257 | /* | ||
| 258 | * We reused an old cell; we can get rid of | ||
| 259 | * the new one. | ||
| 260 | */ | ||
| 261 | dm_bio_prison_free_cell(pool->prison, cell_prealloc); | ||
| 262 | |||
| 263 | return r; | ||
| 264 | } | ||
| 265 | |||
| 266 | static void cell_release(struct pool *pool, | ||
| 267 | struct dm_bio_prison_cell *cell, | ||
| 268 | struct bio_list *bios) | ||
| 269 | { | ||
| 270 | dm_cell_release(pool->prison, cell, bios); | ||
| 271 | dm_bio_prison_free_cell(pool->prison, cell); | ||
| 272 | } | ||
| 273 | |||
| 274 | static void cell_release_no_holder(struct pool *pool, | ||
| 275 | struct dm_bio_prison_cell *cell, | ||
| 276 | struct bio_list *bios) | ||
| 277 | { | ||
| 278 | dm_cell_release_no_holder(pool->prison, cell, bios); | ||
| 279 | dm_bio_prison_free_cell(pool->prison, cell); | ||
| 280 | } | ||
| 281 | |||
| 282 | static void cell_defer_no_holder_no_free(struct thin_c *tc, | ||
| 283 | struct dm_bio_prison_cell *cell) | ||
| 284 | { | ||
| 285 | struct pool *pool = tc->pool; | ||
| 286 | unsigned long flags; | ||
| 287 | |||
| 288 | spin_lock_irqsave(&pool->lock, flags); | ||
| 289 | dm_cell_release_no_holder(pool->prison, cell, &pool->deferred_bios); | ||
| 290 | spin_unlock_irqrestore(&pool->lock, flags); | ||
| 291 | |||
| 292 | wake_worker(pool); | ||
| 293 | } | ||
| 294 | |||
| 295 | static void cell_error(struct pool *pool, | ||
| 296 | struct dm_bio_prison_cell *cell) | ||
| 297 | { | ||
| 298 | dm_cell_error(pool->prison, cell); | ||
| 299 | dm_bio_prison_free_cell(pool->prison, cell); | ||
| 300 | } | ||
| 301 | |||
| 302 | /*----------------------------------------------------------------*/ | ||
| 303 | |||
| 304 | /* | ||
| 230 | * A global list of pools that uses a struct mapped_device as a key. | 305 | * A global list of pools that uses a struct mapped_device as a key. |
| 231 | */ | 306 | */ |
| 232 | static struct dm_thin_pool_table { | 307 | static struct dm_thin_pool_table { |
| @@ -330,14 +405,20 @@ static void requeue_io(struct thin_c *tc) | |||
| 330 | * target. | 405 | * target. |
| 331 | */ | 406 | */ |
| 332 | 407 | ||
| 408 | static bool block_size_is_power_of_two(struct pool *pool) | ||
| 409 | { | ||
| 410 | return pool->sectors_per_block_shift >= 0; | ||
| 411 | } | ||
| 412 | |||
| 333 | static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) | 413 | static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) |
| 334 | { | 414 | { |
| 415 | struct pool *pool = tc->pool; | ||
| 335 | sector_t block_nr = bio->bi_sector; | 416 | sector_t block_nr = bio->bi_sector; |
| 336 | 417 | ||
| 337 | if (tc->pool->sectors_per_block_shift < 0) | 418 | if (block_size_is_power_of_two(pool)) |
| 338 | (void) sector_div(block_nr, tc->pool->sectors_per_block); | 419 | block_nr >>= pool->sectors_per_block_shift; |
| 339 | else | 420 | else |
| 340 | block_nr >>= tc->pool->sectors_per_block_shift; | 421 | (void) sector_div(block_nr, pool->sectors_per_block); |
| 341 | 422 | ||
| 342 | return block_nr; | 423 | return block_nr; |
| 343 | } | 424 | } |
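
The reordered get_bio_block() prefers the shift path and names the test: block_size_is_power_of_two() is simply pool->sectors_per_block_shift >= 0, the shift being cached as -1 for non-power-of-two block sizes. A worked instance with illustrative numbers: for 128-sector (64 KiB) blocks the shift is 7, so sector 1000 falls in block 1000 >> 7 = 7, and the remap() below reassembles the device sector as (7 << 7) | (1000 & 127) = 896 + 104 = 1000; for 96-sector blocks the shift is -1 and sector_div() gives block 1000 / 96 = 10 instead.
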
| @@ -348,12 +429,12 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) | |||
| 348 | sector_t bi_sector = bio->bi_sector; | 429 | sector_t bi_sector = bio->bi_sector; |
| 349 | 430 | ||
| 350 | bio->bi_bdev = tc->pool_dev->bdev; | 431 | bio->bi_bdev = tc->pool_dev->bdev; |
| 351 | if (tc->pool->sectors_per_block_shift < 0) | 432 | if (block_size_is_power_of_two(pool)) |
| 352 | bio->bi_sector = (block * pool->sectors_per_block) + | ||
| 353 | sector_div(bi_sector, pool->sectors_per_block); | ||
| 354 | else | ||
| 355 | bio->bi_sector = (block << pool->sectors_per_block_shift) | | 433 | bio->bi_sector = (block << pool->sectors_per_block_shift) | |
| 356 | (bi_sector & (pool->sectors_per_block - 1)); | 434 | (bi_sector & (pool->sectors_per_block - 1)); |
| 435 | else | ||
| 436 | bio->bi_sector = (block * pool->sectors_per_block) + | ||
| 437 | sector_div(bi_sector, pool->sectors_per_block); | ||
| 357 | } | 438 | } |
| 358 | 439 | ||
| 359 | static void remap_to_origin(struct thin_c *tc, struct bio *bio) | 440 | static void remap_to_origin(struct thin_c *tc, struct bio *bio) |
| @@ -420,15 +501,6 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio, | |||
| 420 | issue(tc, bio); | 501 | issue(tc, bio); |
| 421 | } | 502 | } |
| 422 | 503 | ||
| 423 | /* | ||
| 424 | * wake_worker() is used when new work is queued and when pool_resume is | ||
| 425 | * ready to continue deferred IO processing. | ||
| 426 | */ | ||
| 427 | static void wake_worker(struct pool *pool) | ||
| 428 | { | ||
| 429 | queue_work(pool->wq, &pool->worker); | ||
| 430 | } | ||
| 431 | |||
| 432 | /*----------------------------------------------------------------*/ | 504 | /*----------------------------------------------------------------*/ |
| 433 | 505 | ||
| 434 | /* | 506 | /* |
| @@ -515,14 +587,14 @@ static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell) | |||
| 515 | unsigned long flags; | 587 | unsigned long flags; |
| 516 | 588 | ||
| 517 | spin_lock_irqsave(&pool->lock, flags); | 589 | spin_lock_irqsave(&pool->lock, flags); |
| 518 | dm_cell_release(cell, &pool->deferred_bios); | 590 | cell_release(pool, cell, &pool->deferred_bios); |
| 519 | spin_unlock_irqrestore(&tc->pool->lock, flags); | 591 | spin_unlock_irqrestore(&tc->pool->lock, flags); |
| 520 | 592 | ||
| 521 | wake_worker(pool); | 593 | wake_worker(pool); |
| 522 | } | 594 | } |
| 523 | 595 | ||
| 524 | /* | 596 | /* |
| 525 | * Same as cell_defer except it omits the original holder of the cell. | 597 | * Same as cell_defer above, except it omits the original holder of the cell. |
| 526 | */ | 598 | */ |
| 527 | static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell) | 599 | static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell) |
| 528 | { | 600 | { |
| @@ -530,7 +602,7 @@ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *c | |||
| 530 | unsigned long flags; | 602 | unsigned long flags; |
| 531 | 603 | ||
| 532 | spin_lock_irqsave(&pool->lock, flags); | 604 | spin_lock_irqsave(&pool->lock, flags); |
| 533 | dm_cell_release_no_holder(cell, &pool->deferred_bios); | 605 | cell_release_no_holder(pool, cell, &pool->deferred_bios); |
| 534 | spin_unlock_irqrestore(&pool->lock, flags); | 606 | spin_unlock_irqrestore(&pool->lock, flags); |
| 535 | 607 | ||
| 536 | wake_worker(pool); | 608 | wake_worker(pool); |
| @@ -540,13 +612,15 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) | |||
| 540 | { | 612 | { |
| 541 | if (m->bio) | 613 | if (m->bio) |
| 542 | m->bio->bi_end_io = m->saved_bi_end_io; | 614 | m->bio->bi_end_io = m->saved_bi_end_io; |
| 543 | dm_cell_error(m->cell); | 615 | cell_error(m->tc->pool, m->cell); |
| 544 | list_del(&m->list); | 616 | list_del(&m->list); |
| 545 | mempool_free(m, m->tc->pool->mapping_pool); | 617 | mempool_free(m, m->tc->pool->mapping_pool); |
| 546 | } | 618 | } |
| 619 | |||
| 547 | static void process_prepared_mapping(struct dm_thin_new_mapping *m) | 620 | static void process_prepared_mapping(struct dm_thin_new_mapping *m) |
| 548 | { | 621 | { |
| 549 | struct thin_c *tc = m->tc; | 622 | struct thin_c *tc = m->tc; |
| 623 | struct pool *pool = tc->pool; | ||
| 550 | struct bio *bio; | 624 | struct bio *bio; |
| 551 | int r; | 625 | int r; |
| 552 | 626 | ||
| @@ -555,7 +629,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) | |||
| 555 | bio->bi_end_io = m->saved_bi_end_io; | 629 | bio->bi_end_io = m->saved_bi_end_io; |
| 556 | 630 | ||
| 557 | if (m->err) { | 631 | if (m->err) { |
| 558 | dm_cell_error(m->cell); | 632 | cell_error(pool, m->cell); |
| 559 | goto out; | 633 | goto out; |
| 560 | } | 634 | } |
| 561 | 635 | ||
| @@ -567,7 +641,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) | |||
| 567 | r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); | 641 | r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); |
| 568 | if (r) { | 642 | if (r) { |
| 569 | DMERR_LIMIT("dm_thin_insert_block() failed"); | 643 | DMERR_LIMIT("dm_thin_insert_block() failed"); |
| 570 | dm_cell_error(m->cell); | 644 | cell_error(pool, m->cell); |
| 571 | goto out; | 645 | goto out; |
| 572 | } | 646 | } |
| 573 | 647 | ||
| @@ -585,7 +659,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) | |||
| 585 | 659 | ||
| 586 | out: | 660 | out: |
| 587 | list_del(&m->list); | 661 | list_del(&m->list); |
| 588 | mempool_free(m, tc->pool->mapping_pool); | 662 | mempool_free(m, pool->mapping_pool); |
| 589 | } | 663 | } |
| 590 | 664 | ||
| 591 | static void process_prepared_discard_fail(struct dm_thin_new_mapping *m) | 665 | static void process_prepared_discard_fail(struct dm_thin_new_mapping *m) |
| @@ -736,7 +810,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, | |||
| 736 | if (r < 0) { | 810 | if (r < 0) { |
| 737 | mempool_free(m, pool->mapping_pool); | 811 | mempool_free(m, pool->mapping_pool); |
| 738 | DMERR_LIMIT("dm_kcopyd_copy() failed"); | 812 | DMERR_LIMIT("dm_kcopyd_copy() failed"); |
| 739 | dm_cell_error(cell); | 813 | cell_error(pool, cell); |
| 740 | } | 814 | } |
| 741 | } | 815 | } |
| 742 | } | 816 | } |
| @@ -802,7 +876,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, | |||
| 802 | if (r < 0) { | 876 | if (r < 0) { |
| 803 | mempool_free(m, pool->mapping_pool); | 877 | mempool_free(m, pool->mapping_pool); |
| 804 | DMERR_LIMIT("dm_kcopyd_zero() failed"); | 878 | DMERR_LIMIT("dm_kcopyd_zero() failed"); |
| 805 | dm_cell_error(cell); | 879 | cell_error(pool, cell); |
| 806 | } | 880 | } |
| 807 | } | 881 | } |
| 808 | } | 882 | } |
| @@ -908,13 +982,13 @@ static void retry_on_resume(struct bio *bio) | |||
| 908 | spin_unlock_irqrestore(&pool->lock, flags); | 982 | spin_unlock_irqrestore(&pool->lock, flags); |
| 909 | } | 983 | } |
| 910 | 984 | ||
| 911 | static void no_space(struct dm_bio_prison_cell *cell) | 985 | static void no_space(struct pool *pool, struct dm_bio_prison_cell *cell) |
| 912 | { | 986 | { |
| 913 | struct bio *bio; | 987 | struct bio *bio; |
| 914 | struct bio_list bios; | 988 | struct bio_list bios; |
| 915 | 989 | ||
| 916 | bio_list_init(&bios); | 990 | bio_list_init(&bios); |
| 917 | dm_cell_release(cell, &bios); | 991 | cell_release(pool, cell, &bios); |
| 918 | 992 | ||
| 919 | while ((bio = bio_list_pop(&bios))) | 993 | while ((bio = bio_list_pop(&bios))) |
| 920 | retry_on_resume(bio); | 994 | retry_on_resume(bio); |
| @@ -932,7 +1006,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio) | |||
| 932 | struct dm_thin_new_mapping *m; | 1006 | struct dm_thin_new_mapping *m; |
| 933 | 1007 | ||
| 934 | build_virtual_key(tc->td, block, &key); | 1008 | build_virtual_key(tc->td, block, &key); |
| 935 | if (dm_bio_detain(tc->pool->prison, &key, bio, &cell)) | 1009 | if (bio_detain(tc->pool, &key, bio, &cell)) |
| 936 | return; | 1010 | return; |
| 937 | 1011 | ||
| 938 | r = dm_thin_find_block(tc->td, block, 1, &lookup_result); | 1012 | r = dm_thin_find_block(tc->td, block, 1, &lookup_result); |
| @@ -944,7 +1018,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio) | |||
| 944 | * on this block. | 1018 | * on this block. |
| 945 | */ | 1019 | */ |
| 946 | build_data_key(tc->td, lookup_result.block, &key2); | 1020 | build_data_key(tc->td, lookup_result.block, &key2); |
| 947 | if (dm_bio_detain(tc->pool->prison, &key2, bio, &cell2)) { | 1021 | if (bio_detain(tc->pool, &key2, bio, &cell2)) { |
| 948 | cell_defer_no_holder(tc, cell); | 1022 | cell_defer_no_holder(tc, cell); |
| 949 | break; | 1023 | break; |
| 950 | } | 1024 | } |
| @@ -1020,13 +1094,13 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, | |||
| 1020 | break; | 1094 | break; |
| 1021 | 1095 | ||
| 1022 | case -ENOSPC: | 1096 | case -ENOSPC: |
| 1023 | no_space(cell); | 1097 | no_space(tc->pool, cell); |
| 1024 | break; | 1098 | break; |
| 1025 | 1099 | ||
| 1026 | default: | 1100 | default: |
| 1027 | DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", | 1101 | DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", |
| 1028 | __func__, r); | 1102 | __func__, r); |
| 1029 | dm_cell_error(cell); | 1103 | cell_error(tc->pool, cell); |
| 1030 | break; | 1104 | break; |
| 1031 | } | 1105 | } |
| 1032 | } | 1106 | } |
| @@ -1044,7 +1118,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, | |||
| 1044 | * of being broken so we have nothing further to do here. | 1118 | * of being broken so we have nothing further to do here. |
| 1045 | */ | 1119 | */ |
| 1046 | build_data_key(tc->td, lookup_result->block, &key); | 1120 | build_data_key(tc->td, lookup_result->block, &key); |
| 1047 | if (dm_bio_detain(pool->prison, &key, bio, &cell)) | 1121 | if (bio_detain(pool, &key, bio, &cell)) |
| 1048 | return; | 1122 | return; |
| 1049 | 1123 | ||
| 1050 | if (bio_data_dir(bio) == WRITE && bio->bi_size) | 1124 | if (bio_data_dir(bio) == WRITE && bio->bi_size) |
| @@ -1065,12 +1139,13 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block | |||
| 1065 | { | 1139 | { |
| 1066 | int r; | 1140 | int r; |
| 1067 | dm_block_t data_block; | 1141 | dm_block_t data_block; |
| 1142 | struct pool *pool = tc->pool; | ||
| 1068 | 1143 | ||
| 1069 | /* | 1144 | /* |
| 1070 | * Remap empty bios (flushes) immediately, without provisioning. | 1145 | * Remap empty bios (flushes) immediately, without provisioning. |
| 1071 | */ | 1146 | */ |
| 1072 | if (!bio->bi_size) { | 1147 | if (!bio->bi_size) { |
| 1073 | inc_all_io_entry(tc->pool, bio); | 1148 | inc_all_io_entry(pool, bio); |
| 1074 | cell_defer_no_holder(tc, cell); | 1149 | cell_defer_no_holder(tc, cell); |
| 1075 | 1150 | ||
| 1076 | remap_and_issue(tc, bio, 0); | 1151 | remap_and_issue(tc, bio, 0); |
| @@ -1097,14 +1172,14 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block | |||
| 1097 | break; | 1172 | break; |
| 1098 | 1173 | ||
| 1099 | case -ENOSPC: | 1174 | case -ENOSPC: |
| 1100 | no_space(cell); | 1175 | no_space(pool, cell); |
| 1101 | break; | 1176 | break; |
| 1102 | 1177 | ||
| 1103 | default: | 1178 | default: |
| 1104 | DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", | 1179 | DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", |
| 1105 | __func__, r); | 1180 | __func__, r); |
| 1106 | set_pool_mode(tc->pool, PM_READ_ONLY); | 1181 | set_pool_mode(pool, PM_READ_ONLY); |
| 1107 | dm_cell_error(cell); | 1182 | cell_error(pool, cell); |
| 1108 | break; | 1183 | break; |
| 1109 | } | 1184 | } |
| 1110 | } | 1185 | } |
| @@ -1112,6 +1187,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block | |||
| 1112 | static void process_bio(struct thin_c *tc, struct bio *bio) | 1187 | static void process_bio(struct thin_c *tc, struct bio *bio) |
| 1113 | { | 1188 | { |
| 1114 | int r; | 1189 | int r; |
| 1190 | struct pool *pool = tc->pool; | ||
| 1115 | dm_block_t block = get_bio_block(tc, bio); | 1191 | dm_block_t block = get_bio_block(tc, bio); |
| 1116 | struct dm_bio_prison_cell *cell; | 1192 | struct dm_bio_prison_cell *cell; |
| 1117 | struct dm_cell_key key; | 1193 | struct dm_cell_key key; |
| @@ -1122,7 +1198,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio) | |||
| 1122 | * being provisioned so we have nothing further to do here. | 1198 | * being provisioned so we have nothing further to do here. |
| 1123 | */ | 1199 | */ |
| 1124 | build_virtual_key(tc->td, block, &key); | 1200 | build_virtual_key(tc->td, block, &key); |
| 1125 | if (dm_bio_detain(tc->pool->prison, &key, bio, &cell)) | 1201 | if (bio_detain(pool, &key, bio, &cell)) |
| 1126 | return; | 1202 | return; |
| 1127 | 1203 | ||
| 1128 | r = dm_thin_find_block(tc->td, block, 1, &lookup_result); | 1204 | r = dm_thin_find_block(tc->td, block, 1, &lookup_result); |
| @@ -1130,9 +1206,9 @@ static void process_bio(struct thin_c *tc, struct bio *bio) | |||
| 1130 | case 0: | 1206 | case 0: |
| 1131 | if (lookup_result.shared) { | 1207 | if (lookup_result.shared) { |
| 1132 | process_shared_bio(tc, bio, block, &lookup_result); | 1208 | process_shared_bio(tc, bio, block, &lookup_result); |
| 1133 | cell_defer_no_holder(tc, cell); | 1209 | cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */ |
| 1134 | } else { | 1210 | } else { |
| 1135 | inc_all_io_entry(tc->pool, bio); | 1211 | inc_all_io_entry(pool, bio); |
| 1136 | cell_defer_no_holder(tc, cell); | 1212 | cell_defer_no_holder(tc, cell); |
| 1137 | 1213 | ||
| 1138 | remap_and_issue(tc, bio, lookup_result.block); | 1214 | remap_and_issue(tc, bio, lookup_result.block); |
| @@ -1141,7 +1217,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio) | |||
| 1141 | 1217 | ||
| 1142 | case -ENODATA: | 1218 | case -ENODATA: |
| 1143 | if (bio_data_dir(bio) == READ && tc->origin_dev) { | 1219 | if (bio_data_dir(bio) == READ && tc->origin_dev) { |
| 1144 | inc_all_io_entry(tc->pool, bio); | 1220 | inc_all_io_entry(pool, bio); |
| 1145 | cell_defer_no_holder(tc, cell); | 1221 | cell_defer_no_holder(tc, cell); |
| 1146 | 1222 | ||
| 1147 | remap_to_origin_and_issue(tc, bio); | 1223 | remap_to_origin_and_issue(tc, bio); |
| @@ -1378,7 +1454,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) | |||
| 1378 | dm_block_t block = get_bio_block(tc, bio); | 1454 | dm_block_t block = get_bio_block(tc, bio); |
| 1379 | struct dm_thin_device *td = tc->td; | 1455 | struct dm_thin_device *td = tc->td; |
| 1380 | struct dm_thin_lookup_result result; | 1456 | struct dm_thin_lookup_result result; |
| 1381 | struct dm_bio_prison_cell *cell1, *cell2; | 1457 | struct dm_bio_prison_cell cell1, cell2; |
| 1458 | struct dm_bio_prison_cell *cell_result; | ||
| 1382 | struct dm_cell_key key; | 1459 | struct dm_cell_key key; |
| 1383 | 1460 | ||
| 1384 | thin_hook_bio(tc, bio); | 1461 | thin_hook_bio(tc, bio); |
| @@ -1420,18 +1497,18 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) | |||
| 1420 | } | 1497 | } |
| 1421 | 1498 | ||
| 1422 | build_virtual_key(tc->td, block, &key); | 1499 | build_virtual_key(tc->td, block, &key); |
| 1423 | if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1)) | 1500 | if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result)) |
| 1424 | return DM_MAPIO_SUBMITTED; | 1501 | return DM_MAPIO_SUBMITTED; |
| 1425 | 1502 | ||
| 1426 | build_data_key(tc->td, result.block, &key); | 1503 | build_data_key(tc->td, result.block, &key); |
| 1427 | if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2)) { | 1504 | if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) { |
| 1428 | cell_defer_no_holder(tc, cell1); | 1505 | cell_defer_no_holder_no_free(tc, &cell1); |
| 1429 | return DM_MAPIO_SUBMITTED; | 1506 | return DM_MAPIO_SUBMITTED; |
| 1430 | } | 1507 | } |
| 1431 | 1508 | ||
| 1432 | inc_all_io_entry(tc->pool, bio); | 1509 | inc_all_io_entry(tc->pool, bio); |
| 1433 | cell_defer_no_holder(tc, cell2); | 1510 | cell_defer_no_holder_no_free(tc, &cell2); |
| 1434 | cell_defer_no_holder(tc, cell1); | 1511 | cell_defer_no_holder_no_free(tc, &cell1); |
| 1435 | 1512 | ||
| 1436 | remap(tc, bio, result.block); | 1513 | remap(tc, bio, result.block); |
| 1437 | return DM_MAPIO_REMAPPED; | 1514 | return DM_MAPIO_REMAPPED; |
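dm_bio_detain() now takes a caller-supplied cell plus a cell_result out-parameter, so the lookup fast path in thin_bio_map() can use stack-allocated cells and skip a mempool allocation; cell_defer_no_holder_no_free() then releases any waiters without returning the cell to a pool it never came from. A hedged sketch of the call shape, mirroring the hunk:

    struct dm_bio_prison_cell cell;         /* on the stack, not from a mempool */
    struct dm_bio_prison_cell *cell_result; /* the cell actually holding the key */
    struct dm_cell_key key;

    build_virtual_key(tc->td, block, &key);
    if (dm_bio_detain(tc->pool->prison, &key, bio, &cell, &cell_result))
        return DM_MAPIO_SUBMITTED;  /* another holder exists; bio was queued */

    /* ... fast-path remap ... */
    cell_defer_no_holder_no_free(tc, &cell); /* release waiters, nothing to free */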
| @@ -1636,7 +1713,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, | |||
| 1636 | goto bad_prison; | 1713 | goto bad_prison; |
| 1637 | } | 1714 | } |
| 1638 | 1715 | ||
| 1639 | pool->copier = dm_kcopyd_client_create(); | 1716 | pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); |
| 1640 | if (IS_ERR(pool->copier)) { | 1717 | if (IS_ERR(pool->copier)) { |
| 1641 | r = PTR_ERR(pool->copier); | 1718 | r = PTR_ERR(pool->copier); |
| 1642 | *error = "Error creating pool's kcopyd client"; | 1719 | *error = "Error creating pool's kcopyd client"; |
| @@ -1938,7 +2015,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
| 1938 | pt->data_dev = data_dev; | 2015 | pt->data_dev = data_dev; |
| 1939 | pt->low_water_blocks = low_water_blocks; | 2016 | pt->low_water_blocks = low_water_blocks; |
| 1940 | pt->adjusted_pf = pt->requested_pf = pf; | 2017 | pt->adjusted_pf = pt->requested_pf = pf; |
| 1941 | ti->num_flush_requests = 1; | 2018 | ti->num_flush_bios = 1; |
| 1942 | 2019 | ||
| 1943 | /* | 2020 | /* |
| 1944 | * Only need to enable discards if the pool should pass | 2021 | * Only need to enable discards if the pool should pass |
| @@ -1946,7 +2023,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
| 1946 | * processing will cause mappings to be removed from the btree. | 2023 | * processing will cause mappings to be removed from the btree. |
| 1947 | */ | 2024 | */ |
| 1948 | if (pf.discard_enabled && pf.discard_passdown) { | 2025 | if (pf.discard_enabled && pf.discard_passdown) { |
| 1949 | ti->num_discard_requests = 1; | 2026 | ti->num_discard_bios = 1; |
| 1950 | 2027 | ||
| 1951 | /* | 2028 | /* |
| 1952 | * Setting 'discards_supported' circumvents the normal | 2029 | * Setting 'discards_supported' circumvents the normal |
| @@ -2299,8 +2376,8 @@ static void emit_flags(struct pool_features *pf, char *result, | |||
| 2299 | * <transaction id> <used metadata sectors>/<total metadata sectors> | 2376 | * <transaction id> <used metadata sectors>/<total metadata sectors> |
| 2300 | * <used data sectors>/<total data sectors> <held metadata root> | 2377 | * <used data sectors>/<total data sectors> <held metadata root> |
| 2301 | */ | 2378 | */ |
| 2302 | static int pool_status(struct dm_target *ti, status_type_t type, | 2379 | static void pool_status(struct dm_target *ti, status_type_t type, |
| 2303 | unsigned status_flags, char *result, unsigned maxlen) | 2380 | unsigned status_flags, char *result, unsigned maxlen) |
| 2304 | { | 2381 | { |
| 2305 | int r; | 2382 | int r; |
| 2306 | unsigned sz = 0; | 2383 | unsigned sz = 0; |
| @@ -2326,32 +2403,41 @@ static int pool_status(struct dm_target *ti, status_type_t type, | |||
| 2326 | if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) | 2403 | if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) |
| 2327 | (void) commit_or_fallback(pool); | 2404 | (void) commit_or_fallback(pool); |
| 2328 | 2405 | ||
| 2329 | r = dm_pool_get_metadata_transaction_id(pool->pmd, | 2406 | r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id); |
| 2330 | &transaction_id); | 2407 | if (r) { |
| 2331 | if (r) | 2408 | DMERR("dm_pool_get_metadata_transaction_id returned %d", r); |
| 2332 | return r; | 2409 | goto err; |
| 2410 | } | ||
| 2333 | 2411 | ||
| 2334 | r = dm_pool_get_free_metadata_block_count(pool->pmd, | 2412 | r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata); |
| 2335 | &nr_free_blocks_metadata); | 2413 | if (r) { |
| 2336 | if (r) | 2414 | DMERR("dm_pool_get_free_metadata_block_count returned %d", r); |
| 2337 | return r; | 2415 | goto err; |
| 2416 | } | ||
| 2338 | 2417 | ||
| 2339 | r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata); | 2418 | r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata); |
| 2340 | if (r) | 2419 | if (r) { |
| 2341 | return r; | 2420 | DMERR("dm_pool_get_metadata_dev_size returned %d", r); |
| 2421 | goto err; | ||
| 2422 | } | ||
| 2342 | 2423 | ||
| 2343 | r = dm_pool_get_free_block_count(pool->pmd, | 2424 | r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data); |
| 2344 | &nr_free_blocks_data); | 2425 | if (r) { |
| 2345 | if (r) | 2426 | DMERR("dm_pool_get_free_block_count returned %d", r); |
| 2346 | return r; | 2427 | goto err; |
| 2428 | } | ||
| 2347 | 2429 | ||
| 2348 | r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data); | 2430 | r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data); |
| 2349 | if (r) | 2431 | if (r) { |
| 2350 | return r; | 2432 | DMERR("dm_pool_get_data_dev_size returned %d", r); |
| 2433 | goto err; | ||
| 2434 | } | ||
| 2351 | 2435 | ||
| 2352 | r = dm_pool_get_metadata_snap(pool->pmd, &held_root); | 2436 | r = dm_pool_get_metadata_snap(pool->pmd, &held_root); |
| 2353 | if (r) | 2437 | if (r) { |
| 2354 | return r; | 2438 | DMERR("dm_pool_get_metadata_snap returned %d", r); |
| 2439 | goto err; | ||
| 2440 | } | ||
| 2355 | 2441 | ||
| 2356 | DMEMIT("%llu %llu/%llu %llu/%llu ", | 2442 | DMEMIT("%llu %llu/%llu %llu/%llu ", |
| 2357 | (unsigned long long)transaction_id, | 2443 | (unsigned long long)transaction_id, |
| @@ -2388,8 +2474,10 @@ static int pool_status(struct dm_target *ti, status_type_t type, | |||
| 2388 | emit_flags(&pt->requested_pf, result, sz, maxlen); | 2474 | emit_flags(&pt->requested_pf, result, sz, maxlen); |
| 2389 | break; | 2475 | break; |
| 2390 | } | 2476 | } |
| 2477 | return; | ||
| 2391 | 2478 | ||
| 2392 | return 0; | 2479 | err: |
| 2480 | DMEMIT("Error"); | ||
| 2393 | } | 2481 | } |
| 2394 | 2482 | ||
| 2395 | static int pool_iterate_devices(struct dm_target *ti, | 2483 | static int pool_iterate_devices(struct dm_target *ti, |
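The .status target methods change from int to void in this series, so a failing metadata query can no longer propagate an errno to the caller; instead the handler logs via DMERR() and emits the literal string "Error" for userspace (dmsetup status) to see. A minimal sketch of the new shape; the query function and value are hypothetical stand-ins:

    static void example_status(struct dm_target *ti, status_type_t type,
                               unsigned status_flags, char *result, unsigned maxlen)
    {
        unsigned sz = 0;        /* DMEMIT() appends at result + sz */
        uint64_t value;
        int r;

        r = example_metadata_query(ti->private, &value);    /* hypothetical */
        if (r) {
            DMERR("example_metadata_query returned %d", r);
            goto err;
        }

        DMEMIT("%llu", (unsigned long long)value);
        return;

    err:
        DMEMIT("Error");
    }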
| @@ -2414,11 +2502,6 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | |||
| 2414 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | 2502 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); |
| 2415 | } | 2503 | } |
| 2416 | 2504 | ||
| 2417 | static bool block_size_is_power_of_two(struct pool *pool) | ||
| 2418 | { | ||
| 2419 | return pool->sectors_per_block_shift >= 0; | ||
| 2420 | } | ||
| 2421 | |||
| 2422 | static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits) | 2505 | static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits) |
| 2423 | { | 2506 | { |
| 2424 | struct pool *pool = pt->pool; | 2507 | struct pool *pool = pt->pool; |
| @@ -2432,15 +2515,8 @@ static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits) | |||
| 2432 | if (pt->adjusted_pf.discard_passdown) { | 2515 | if (pt->adjusted_pf.discard_passdown) { |
| 2433 | data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits; | 2516 | data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits; |
| 2434 | limits->discard_granularity = data_limits->discard_granularity; | 2517 | limits->discard_granularity = data_limits->discard_granularity; |
| 2435 | } else if (block_size_is_power_of_two(pool)) | 2518 | } else |
| 2436 | limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; | 2519 | limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; |
| 2437 | else | ||
| 2438 | /* | ||
| 2439 | * Use largest power of 2 that is a factor of sectors_per_block | ||
| 2440 | * but at least DATA_DEV_BLOCK_SIZE_MIN_SECTORS. | ||
| 2441 | */ | ||
| 2442 | limits->discard_granularity = max(1 << (ffs(pool->sectors_per_block) - 1), | ||
| 2443 | DATA_DEV_BLOCK_SIZE_MIN_SECTORS) << SECTOR_SHIFT; | ||
| 2444 | } | 2520 | } |
| 2445 | 2521 | ||
| 2446 | static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) | 2522 | static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) |
| @@ -2468,7 +2544,7 @@ static struct target_type pool_target = { | |||
| 2468 | .name = "thin-pool", | 2544 | .name = "thin-pool", |
| 2469 | .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | | 2545 | .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | |
| 2470 | DM_TARGET_IMMUTABLE, | 2546 | DM_TARGET_IMMUTABLE, |
| 2471 | .version = {1, 6, 0}, | 2547 | .version = {1, 6, 1}, |
| 2472 | .module = THIS_MODULE, | 2548 | .module = THIS_MODULE, |
| 2473 | .ctr = pool_ctr, | 2549 | .ctr = pool_ctr, |
| 2474 | .dtr = pool_dtr, | 2550 | .dtr = pool_dtr, |
| @@ -2588,17 +2664,17 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
| 2588 | if (r) | 2664 | if (r) |
| 2589 | goto bad_thin_open; | 2665 | goto bad_thin_open; |
| 2590 | 2666 | ||
| 2591 | ti->num_flush_requests = 1; | 2667 | ti->num_flush_bios = 1; |
| 2592 | ti->flush_supported = true; | 2668 | ti->flush_supported = true; |
| 2593 | ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook); | 2669 | ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook); |
| 2594 | 2670 | ||
| 2595 | /* In case the pool supports discards, pass them on. */ | 2671 | /* In case the pool supports discards, pass them on. */ |
| 2596 | if (tc->pool->pf.discard_enabled) { | 2672 | if (tc->pool->pf.discard_enabled) { |
| 2597 | ti->discards_supported = true; | 2673 | ti->discards_supported = true; |
| 2598 | ti->num_discard_requests = 1; | 2674 | ti->num_discard_bios = 1; |
| 2599 | ti->discard_zeroes_data_unsupported = true; | 2675 | ti->discard_zeroes_data_unsupported = true; |
| 2600 | /* Discard requests must be split on a block boundary */ | 2676 | /* Discard bios must be split on a block boundary */ |
| 2601 | ti->split_discard_requests = true; | 2677 | ti->split_discard_bios = true; |
| 2602 | } | 2678 | } |
| 2603 | 2679 | ||
| 2604 | dm_put(pool_md); | 2680 | dm_put(pool_md); |
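The ti->num_flush_requests, num_discard_requests and split_discard_requests fields are renamed to their *_bios equivalents throughout this series, reserving "request" terminology for request-based targets. For context, the renamed flush counter is consumed like this in the dm.c hunks further down:

    /* dm core duplicates each empty flush once per ti->num_flush_bios: */
    while ((ti = dm_table_get_target(ci->map, target_nr++)))
        __send_duplicate_bios(ci, ti, ti->num_flush_bios, 0);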
| @@ -2676,8 +2752,8 @@ static void thin_postsuspend(struct dm_target *ti) | |||
| 2676 | /* | 2752 | /* |
| 2677 | * <nr mapped sectors> <highest mapped sector> | 2753 | * <nr mapped sectors> <highest mapped sector> |
| 2678 | */ | 2754 | */ |
| 2679 | static int thin_status(struct dm_target *ti, status_type_t type, | 2755 | static void thin_status(struct dm_target *ti, status_type_t type, |
| 2680 | unsigned status_flags, char *result, unsigned maxlen) | 2756 | unsigned status_flags, char *result, unsigned maxlen) |
| 2681 | { | 2757 | { |
| 2682 | int r; | 2758 | int r; |
| 2683 | ssize_t sz = 0; | 2759 | ssize_t sz = 0; |
| @@ -2687,7 +2763,7 @@ static int thin_status(struct dm_target *ti, status_type_t type, | |||
| 2687 | 2763 | ||
| 2688 | if (get_pool_mode(tc->pool) == PM_FAIL) { | 2764 | if (get_pool_mode(tc->pool) == PM_FAIL) { |
| 2689 | DMEMIT("Fail"); | 2765 | DMEMIT("Fail"); |
| 2690 | return 0; | 2766 | return; |
| 2691 | } | 2767 | } |
| 2692 | 2768 | ||
| 2693 | if (!tc->td) | 2769 | if (!tc->td) |
| @@ -2696,12 +2772,16 @@ static int thin_status(struct dm_target *ti, status_type_t type, | |||
| 2696 | switch (type) { | 2772 | switch (type) { |
| 2697 | case STATUSTYPE_INFO: | 2773 | case STATUSTYPE_INFO: |
| 2698 | r = dm_thin_get_mapped_count(tc->td, &mapped); | 2774 | r = dm_thin_get_mapped_count(tc->td, &mapped); |
| 2699 | if (r) | 2775 | if (r) { |
| 2700 | return r; | 2776 | DMERR("dm_thin_get_mapped_count returned %d", r); |
| 2777 | goto err; | ||
| 2778 | } | ||
| 2701 | 2779 | ||
| 2702 | r = dm_thin_get_highest_mapped_block(tc->td, &highest); | 2780 | r = dm_thin_get_highest_mapped_block(tc->td, &highest); |
| 2703 | if (r < 0) | 2781 | if (r < 0) { |
| 2704 | return r; | 2782 | DMERR("dm_thin_get_highest_mapped_block returned %d", r); |
| 2783 | goto err; | ||
| 2784 | } | ||
| 2705 | 2785 | ||
| 2706 | DMEMIT("%llu ", mapped * tc->pool->sectors_per_block); | 2786 | DMEMIT("%llu ", mapped * tc->pool->sectors_per_block); |
| 2707 | if (r) | 2787 | if (r) |
| @@ -2721,7 +2801,10 @@ static int thin_status(struct dm_target *ti, status_type_t type, | |||
| 2721 | } | 2801 | } |
| 2722 | } | 2802 | } |
| 2723 | 2803 | ||
| 2724 | return 0; | 2804 | return; |
| 2805 | |||
| 2806 | err: | ||
| 2807 | DMEMIT("Error"); | ||
| 2725 | } | 2808 | } |
| 2726 | 2809 | ||
| 2727 | static int thin_iterate_devices(struct dm_target *ti, | 2810 | static int thin_iterate_devices(struct dm_target *ti, |
| @@ -2748,7 +2831,7 @@ static int thin_iterate_devices(struct dm_target *ti, | |||
| 2748 | 2831 | ||
| 2749 | static struct target_type thin_target = { | 2832 | static struct target_type thin_target = { |
| 2750 | .name = "thin", | 2833 | .name = "thin", |
| 2751 | .version = {1, 7, 0}, | 2834 | .version = {1, 7, 1}, |
| 2752 | .module = THIS_MODULE, | 2835 | .module = THIS_MODULE, |
| 2753 | .ctr = thin_ctr, | 2836 | .ctr = thin_ctr, |
| 2754 | .dtr = thin_dtr, | 2837 | .dtr = thin_dtr, |
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 52cde982164a..6ad538375c3c 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c | |||
| @@ -508,8 +508,8 @@ static int verity_map(struct dm_target *ti, struct bio *bio) | |||
| 508 | /* | 508 | /* |
| 509 | * Status: V (valid) or C (corruption found) | 509 | * Status: V (valid) or C (corruption found) |
| 510 | */ | 510 | */ |
| 511 | static int verity_status(struct dm_target *ti, status_type_t type, | 511 | static void verity_status(struct dm_target *ti, status_type_t type, |
| 512 | unsigned status_flags, char *result, unsigned maxlen) | 512 | unsigned status_flags, char *result, unsigned maxlen) |
| 513 | { | 513 | { |
| 514 | struct dm_verity *v = ti->private; | 514 | struct dm_verity *v = ti->private; |
| 515 | unsigned sz = 0; | 515 | unsigned sz = 0; |
| @@ -540,8 +540,6 @@ static int verity_status(struct dm_target *ti, status_type_t type, | |||
| 540 | DMEMIT("%02x", v->salt[x]); | 540 | DMEMIT("%02x", v->salt[x]); |
| 541 | break; | 541 | break; |
| 542 | } | 542 | } |
| 543 | |||
| 544 | return 0; | ||
| 545 | } | 543 | } |
| 546 | 544 | ||
| 547 | static int verity_ioctl(struct dm_target *ti, unsigned cmd, | 545 | static int verity_ioctl(struct dm_target *ti, unsigned cmd, |
| @@ -860,7 +858,7 @@ bad: | |||
| 860 | 858 | ||
| 861 | static struct target_type verity_target = { | 859 | static struct target_type verity_target = { |
| 862 | .name = "verity", | 860 | .name = "verity", |
| 863 | .version = {1, 1, 0}, | 861 | .version = {1, 1, 1}, |
| 864 | .module = THIS_MODULE, | 862 | .module = THIS_MODULE, |
| 865 | .ctr = verity_ctr, | 863 | .ctr = verity_ctr, |
| 866 | .dtr = verity_dtr, | 864 | .dtr = verity_dtr, |
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c index 69a5c3b3b340..c99003e0d47a 100644 --- a/drivers/md/dm-zero.c +++ b/drivers/md/dm-zero.c | |||
| @@ -25,7 +25,7 @@ static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 25 | /* | 25 | /* |
| 26 | * Silently drop discards, avoiding -EOPNOTSUPP. | 26 | * Silently drop discards, avoiding -EOPNOTSUPP. |
| 27 | */ | 27 | */ |
| 28 | ti->num_discard_requests = 1; | 28 | ti->num_discard_bios = 1; |
| 29 | 29 | ||
| 30 | return 0; | 30 | return 0; |
| 31 | } | 31 | } |
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 314a0e2faf79..7e469260fe5e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
| @@ -163,7 +163,6 @@ struct mapped_device { | |||
| 163 | * io objects are allocated from here. | 163 | * io objects are allocated from here. |
| 164 | */ | 164 | */ |
| 165 | mempool_t *io_pool; | 165 | mempool_t *io_pool; |
| 166 | mempool_t *tio_pool; | ||
| 167 | 166 | ||
| 168 | struct bio_set *bs; | 167 | struct bio_set *bs; |
| 169 | 168 | ||
| @@ -197,7 +196,6 @@ struct mapped_device { | |||
| 197 | */ | 196 | */ |
| 198 | struct dm_md_mempools { | 197 | struct dm_md_mempools { |
| 199 | mempool_t *io_pool; | 198 | mempool_t *io_pool; |
| 200 | mempool_t *tio_pool; | ||
| 201 | struct bio_set *bs; | 199 | struct bio_set *bs; |
| 202 | }; | 200 | }; |
| 203 | 201 | ||
| @@ -205,12 +203,6 @@ struct dm_md_mempools { | |||
| 205 | static struct kmem_cache *_io_cache; | 203 | static struct kmem_cache *_io_cache; |
| 206 | static struct kmem_cache *_rq_tio_cache; | 204 | static struct kmem_cache *_rq_tio_cache; |
| 207 | 205 | ||
| 208 | /* | ||
| 209 | * Unused now, and needs to be deleted. But since io_pool is overloaded and it's | ||
| 210 | * still used for _io_cache, I'm leaving this for a later cleanup | ||
| 211 | */ | ||
| 212 | static struct kmem_cache *_rq_bio_info_cache; | ||
| 213 | |||
| 214 | static int __init local_init(void) | 206 | static int __init local_init(void) |
| 215 | { | 207 | { |
| 216 | int r = -ENOMEM; | 208 | int r = -ENOMEM; |
| @@ -224,13 +216,9 @@ static int __init local_init(void) | |||
| 224 | if (!_rq_tio_cache) | 216 | if (!_rq_tio_cache) |
| 225 | goto out_free_io_cache; | 217 | goto out_free_io_cache; |
| 226 | 218 | ||
| 227 | _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0); | ||
| 228 | if (!_rq_bio_info_cache) | ||
| 229 | goto out_free_rq_tio_cache; | ||
| 230 | |||
| 231 | r = dm_uevent_init(); | 219 | r = dm_uevent_init(); |
| 232 | if (r) | 220 | if (r) |
| 233 | goto out_free_rq_bio_info_cache; | 221 | goto out_free_rq_tio_cache; |
| 234 | 222 | ||
| 235 | _major = major; | 223 | _major = major; |
| 236 | r = register_blkdev(_major, _name); | 224 | r = register_blkdev(_major, _name); |
| @@ -244,8 +232,6 @@ static int __init local_init(void) | |||
| 244 | 232 | ||
| 245 | out_uevent_exit: | 233 | out_uevent_exit: |
| 246 | dm_uevent_exit(); | 234 | dm_uevent_exit(); |
| 247 | out_free_rq_bio_info_cache: | ||
| 248 | kmem_cache_destroy(_rq_bio_info_cache); | ||
| 249 | out_free_rq_tio_cache: | 235 | out_free_rq_tio_cache: |
| 250 | kmem_cache_destroy(_rq_tio_cache); | 236 | kmem_cache_destroy(_rq_tio_cache); |
| 251 | out_free_io_cache: | 237 | out_free_io_cache: |
| @@ -256,7 +242,6 @@ out_free_io_cache: | |||
| 256 | 242 | ||
| 257 | static void local_exit(void) | 243 | static void local_exit(void) |
| 258 | { | 244 | { |
| 259 | kmem_cache_destroy(_rq_bio_info_cache); | ||
| 260 | kmem_cache_destroy(_rq_tio_cache); | 245 | kmem_cache_destroy(_rq_tio_cache); |
| 261 | kmem_cache_destroy(_io_cache); | 246 | kmem_cache_destroy(_io_cache); |
| 262 | unregister_blkdev(_major, _name); | 247 | unregister_blkdev(_major, _name); |
| @@ -318,7 +303,6 @@ static void __exit dm_exit(void) | |||
| 318 | /* | 303 | /* |
| 319 | * Should be empty by this point. | 304 | * Should be empty by this point. |
| 320 | */ | 305 | */ |
| 321 | idr_remove_all(&_minor_idr); | ||
| 322 | idr_destroy(&_minor_idr); | 306 | idr_destroy(&_minor_idr); |
| 323 | } | 307 | } |
| 324 | 308 | ||
| @@ -449,12 +433,12 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio) | |||
| 449 | static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, | 433 | static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, |
| 450 | gfp_t gfp_mask) | 434 | gfp_t gfp_mask) |
| 451 | { | 435 | { |
| 452 | return mempool_alloc(md->tio_pool, gfp_mask); | 436 | return mempool_alloc(md->io_pool, gfp_mask); |
| 453 | } | 437 | } |
| 454 | 438 | ||
| 455 | static void free_rq_tio(struct dm_rq_target_io *tio) | 439 | static void free_rq_tio(struct dm_rq_target_io *tio) |
| 456 | { | 440 | { |
| 457 | mempool_free(tio, tio->md->tio_pool); | 441 | mempool_free(tio, tio->md->io_pool); |
| 458 | } | 442 | } |
| 459 | 443 | ||
| 460 | static int md_in_flight(struct mapped_device *md) | 444 | static int md_in_flight(struct mapped_device *md) |
| @@ -627,7 +611,6 @@ static void dec_pending(struct dm_io *io, int error) | |||
| 627 | queue_io(md, bio); | 611 | queue_io(md, bio); |
| 628 | } else { | 612 | } else { |
| 629 | /* done with normal IO or empty flush */ | 613 | /* done with normal IO or empty flush */ |
| 630 | trace_block_bio_complete(md->queue, bio, io_error); | ||
| 631 | bio_endio(bio, io_error); | 614 | bio_endio(bio, io_error); |
| 632 | } | 615 | } |
| 633 | } | 616 | } |
| @@ -987,12 +970,13 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) | |||
| 987 | } | 970 | } |
| 988 | EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); | 971 | EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); |
| 989 | 972 | ||
| 990 | static void __map_bio(struct dm_target *ti, struct dm_target_io *tio) | 973 | static void __map_bio(struct dm_target_io *tio) |
| 991 | { | 974 | { |
| 992 | int r; | 975 | int r; |
| 993 | sector_t sector; | 976 | sector_t sector; |
| 994 | struct mapped_device *md; | 977 | struct mapped_device *md; |
| 995 | struct bio *clone = &tio->clone; | 978 | struct bio *clone = &tio->clone; |
| 979 | struct dm_target *ti = tio->ti; | ||
| 996 | 980 | ||
| 997 | clone->bi_end_io = clone_endio; | 981 | clone->bi_end_io = clone_endio; |
| 998 | clone->bi_private = tio; | 982 | clone->bi_private = tio; |
| @@ -1033,32 +1017,54 @@ struct clone_info { | |||
| 1033 | unsigned short idx; | 1017 | unsigned short idx; |
| 1034 | }; | 1018 | }; |
| 1035 | 1019 | ||
| 1020 | static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len) | ||
| 1021 | { | ||
| 1022 | bio->bi_sector = sector; | ||
| 1023 | bio->bi_size = to_bytes(len); | ||
| 1024 | } | ||
| 1025 | |||
| 1026 | static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count) | ||
| 1027 | { | ||
| 1028 | bio->bi_idx = idx; | ||
| 1029 | bio->bi_vcnt = idx + bv_count; | ||
| 1030 | bio->bi_flags &= ~(1 << BIO_SEG_VALID); | ||
| 1031 | } | ||
| 1032 | |||
| 1033 | static void clone_bio_integrity(struct bio *bio, struct bio *clone, | ||
| 1034 | unsigned short idx, unsigned len, unsigned offset, | ||
| 1035 | unsigned trim) | ||
| 1036 | { | ||
| 1037 | if (!bio_integrity(bio)) | ||
| 1038 | return; | ||
| 1039 | |||
| 1040 | bio_integrity_clone(clone, bio, GFP_NOIO); | ||
| 1041 | |||
| 1042 | if (trim) | ||
| 1043 | bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len); | ||
| 1044 | } | ||
| 1045 | |||
| 1036 | /* | 1046 | /* |
| 1037 | * Creates a little bio that just does part of a bvec. | 1047 | * Creates a little bio that just does part of a bvec. |
| 1038 | */ | 1048 | */ |
| 1039 | static void split_bvec(struct dm_target_io *tio, struct bio *bio, | 1049 | static void clone_split_bio(struct dm_target_io *tio, struct bio *bio, |
| 1040 | sector_t sector, unsigned short idx, unsigned int offset, | 1050 | sector_t sector, unsigned short idx, |
| 1041 | unsigned int len, struct bio_set *bs) | 1051 | unsigned offset, unsigned len) |
| 1042 | { | 1052 | { |
| 1043 | struct bio *clone = &tio->clone; | 1053 | struct bio *clone = &tio->clone; |
| 1044 | struct bio_vec *bv = bio->bi_io_vec + idx; | 1054 | struct bio_vec *bv = bio->bi_io_vec + idx; |
| 1045 | 1055 | ||
| 1046 | *clone->bi_io_vec = *bv; | 1056 | *clone->bi_io_vec = *bv; |
| 1047 | 1057 | ||
| 1048 | clone->bi_sector = sector; | 1058 | bio_setup_sector(clone, sector, len); |
| 1059 | |||
| 1049 | clone->bi_bdev = bio->bi_bdev; | 1060 | clone->bi_bdev = bio->bi_bdev; |
| 1050 | clone->bi_rw = bio->bi_rw; | 1061 | clone->bi_rw = bio->bi_rw; |
| 1051 | clone->bi_vcnt = 1; | 1062 | clone->bi_vcnt = 1; |
| 1052 | clone->bi_size = to_bytes(len); | ||
| 1053 | clone->bi_io_vec->bv_offset = offset; | 1063 | clone->bi_io_vec->bv_offset = offset; |
| 1054 | clone->bi_io_vec->bv_len = clone->bi_size; | 1064 | clone->bi_io_vec->bv_len = clone->bi_size; |
| 1055 | clone->bi_flags |= 1 << BIO_CLONED; | 1065 | clone->bi_flags |= 1 << BIO_CLONED; |
| 1056 | 1066 | ||
| 1057 | if (bio_integrity(bio)) { | 1067 | clone_bio_integrity(bio, clone, idx, len, offset, 1); |
| 1058 | bio_integrity_clone(clone, bio, GFP_NOIO); | ||
| 1059 | bio_integrity_trim(clone, | ||
| 1060 | bio_sector_offset(bio, idx, offset), len); | ||
| 1061 | } | ||
| 1062 | } | 1068 | } |
| 1063 | 1069 | ||
| 1064 | /* | 1070 | /* |
| @@ -1066,29 +1072,23 @@ static void split_bvec(struct dm_target_io *tio, struct bio *bio, | |||
| 1066 | */ | 1072 | */ |
| 1067 | static void clone_bio(struct dm_target_io *tio, struct bio *bio, | 1073 | static void clone_bio(struct dm_target_io *tio, struct bio *bio, |
| 1068 | sector_t sector, unsigned short idx, | 1074 | sector_t sector, unsigned short idx, |
| 1069 | unsigned short bv_count, unsigned int len, | 1075 | unsigned short bv_count, unsigned len) |
| 1070 | struct bio_set *bs) | ||
| 1071 | { | 1076 | { |
| 1072 | struct bio *clone = &tio->clone; | 1077 | struct bio *clone = &tio->clone; |
| 1078 | unsigned trim = 0; | ||
| 1073 | 1079 | ||
| 1074 | __bio_clone(clone, bio); | 1080 | __bio_clone(clone, bio); |
| 1075 | clone->bi_sector = sector; | 1081 | bio_setup_sector(clone, sector, len); |
| 1076 | clone->bi_idx = idx; | 1082 | bio_setup_bv(clone, idx, bv_count); |
| 1077 | clone->bi_vcnt = idx + bv_count; | 1083 | |
| 1078 | clone->bi_size = to_bytes(len); | 1084 | if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) |
| 1079 | clone->bi_flags &= ~(1 << BIO_SEG_VALID); | 1085 | trim = 1; |
| 1080 | 1086 | clone_bio_integrity(bio, clone, idx, len, 0, trim); | |
| 1081 | if (bio_integrity(bio)) { | ||
| 1082 | bio_integrity_clone(clone, bio, GFP_NOIO); | ||
| 1083 | |||
| 1084 | if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) | ||
| 1085 | bio_integrity_trim(clone, | ||
| 1086 | bio_sector_offset(bio, idx, 0), len); | ||
| 1087 | } | ||
| 1088 | } | 1087 | } |
| 1089 | 1088 | ||
| 1090 | static struct dm_target_io *alloc_tio(struct clone_info *ci, | 1089 | static struct dm_target_io *alloc_tio(struct clone_info *ci, |
| 1091 | struct dm_target *ti, int nr_iovecs) | 1090 | struct dm_target *ti, int nr_iovecs, |
| 1091 | unsigned target_bio_nr) | ||
| 1092 | { | 1092 | { |
| 1093 | struct dm_target_io *tio; | 1093 | struct dm_target_io *tio; |
| 1094 | struct bio *clone; | 1094 | struct bio *clone; |
| @@ -1099,96 +1099,104 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, | |||
| 1099 | tio->io = ci->io; | 1099 | tio->io = ci->io; |
| 1100 | tio->ti = ti; | 1100 | tio->ti = ti; |
| 1101 | memset(&tio->info, 0, sizeof(tio->info)); | 1101 | memset(&tio->info, 0, sizeof(tio->info)); |
| 1102 | tio->target_request_nr = 0; | 1102 | tio->target_bio_nr = target_bio_nr; |
| 1103 | 1103 | ||
| 1104 | return tio; | 1104 | return tio; |
| 1105 | } | 1105 | } |
| 1106 | 1106 | ||
| 1107 | static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, | 1107 | static void __clone_and_map_simple_bio(struct clone_info *ci, |
| 1108 | unsigned request_nr, sector_t len) | 1108 | struct dm_target *ti, |
| 1109 | unsigned target_bio_nr, sector_t len) | ||
| 1109 | { | 1110 | { |
| 1110 | struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs); | 1111 | struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr); |
| 1111 | struct bio *clone = &tio->clone; | 1112 | struct bio *clone = &tio->clone; |
| 1112 | 1113 | ||
| 1113 | tio->target_request_nr = request_nr; | ||
| 1114 | |||
| 1115 | /* | 1114 | /* |
| 1116 | * Discard requests require the bio's inline iovecs be initialized. | 1115 | * Discard requests require the bio's inline iovecs be initialized. |
| 1117 | * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush | 1116 | * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush |
| 1118 | * and discard, so no need for concern about wasted bvec allocations. | 1117 | * and discard, so no need for concern about wasted bvec allocations. |
| 1119 | */ | 1118 | */ |
| 1120 | |||
| 1121 | __bio_clone(clone, ci->bio); | 1119 | __bio_clone(clone, ci->bio); |
| 1122 | if (len) { | 1120 | if (len) |
| 1123 | clone->bi_sector = ci->sector; | 1121 | bio_setup_sector(clone, ci->sector, len); |
| 1124 | clone->bi_size = to_bytes(len); | ||
| 1125 | } | ||
| 1126 | 1122 | ||
| 1127 | __map_bio(ti, tio); | 1123 | __map_bio(tio); |
| 1128 | } | 1124 | } |
| 1129 | 1125 | ||
| 1130 | static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, | 1126 | static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, |
| 1131 | unsigned num_requests, sector_t len) | 1127 | unsigned num_bios, sector_t len) |
| 1132 | { | 1128 | { |
| 1133 | unsigned request_nr; | 1129 | unsigned target_bio_nr; |
| 1134 | 1130 | ||
| 1135 | for (request_nr = 0; request_nr < num_requests; request_nr++) | 1131 | for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++) |
| 1136 | __issue_target_request(ci, ti, request_nr, len); | 1132 | __clone_and_map_simple_bio(ci, ti, target_bio_nr, len); |
| 1137 | } | 1133 | } |
| 1138 | 1134 | ||
| 1139 | static int __clone_and_map_empty_flush(struct clone_info *ci) | 1135 | static int __send_empty_flush(struct clone_info *ci) |
| 1140 | { | 1136 | { |
| 1141 | unsigned target_nr = 0; | 1137 | unsigned target_nr = 0; |
| 1142 | struct dm_target *ti; | 1138 | struct dm_target *ti; |
| 1143 | 1139 | ||
| 1144 | BUG_ON(bio_has_data(ci->bio)); | 1140 | BUG_ON(bio_has_data(ci->bio)); |
| 1145 | while ((ti = dm_table_get_target(ci->map, target_nr++))) | 1141 | while ((ti = dm_table_get_target(ci->map, target_nr++))) |
| 1146 | __issue_target_requests(ci, ti, ti->num_flush_requests, 0); | 1142 | __send_duplicate_bios(ci, ti, ti->num_flush_bios, 0); |
| 1147 | 1143 | ||
| 1148 | return 0; | 1144 | return 0; |
| 1149 | } | 1145 | } |
| 1150 | 1146 | ||
| 1151 | /* | 1147 | static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, |
| 1152 | * Perform all io with a single clone. | 1148 | sector_t sector, int nr_iovecs, |
| 1153 | */ | 1149 | unsigned short idx, unsigned short bv_count, |
| 1154 | static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) | 1150 | unsigned offset, unsigned len, |
| 1151 | unsigned split_bvec) | ||
| 1155 | { | 1152 | { |
| 1156 | struct bio *bio = ci->bio; | 1153 | struct bio *bio = ci->bio; |
| 1157 | struct dm_target_io *tio; | 1154 | struct dm_target_io *tio; |
| 1155 | unsigned target_bio_nr; | ||
| 1156 | unsigned num_target_bios = 1; | ||
| 1158 | 1157 | ||
| 1159 | tio = alloc_tio(ci, ti, bio->bi_max_vecs); | 1158 | /* |
| 1160 | clone_bio(tio, bio, ci->sector, ci->idx, bio->bi_vcnt - ci->idx, | 1159 | * Does the target want to receive duplicate copies of the bio? |
| 1161 | ci->sector_count, ci->md->bs); | 1160 | */ |
| 1162 | __map_bio(ti, tio); | 1161 | if (bio_data_dir(bio) == WRITE && ti->num_write_bios) |
| 1163 | ci->sector_count = 0; | 1162 | num_target_bios = ti->num_write_bios(ti, bio); |
| 1163 | |||
| 1164 | for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { | ||
| 1165 | tio = alloc_tio(ci, ti, nr_iovecs, target_bio_nr); | ||
| 1166 | if (split_bvec) | ||
| 1167 | clone_split_bio(tio, bio, sector, idx, offset, len); | ||
| 1168 | else | ||
| 1169 | clone_bio(tio, bio, sector, idx, bv_count, len); | ||
| 1170 | __map_bio(tio); | ||
| 1171 | } | ||
| 1164 | } | 1172 | } |
| 1165 | 1173 | ||
| 1166 | typedef unsigned (*get_num_requests_fn)(struct dm_target *ti); | 1174 | typedef unsigned (*get_num_bios_fn)(struct dm_target *ti); |
| 1167 | 1175 | ||
| 1168 | static unsigned get_num_discard_requests(struct dm_target *ti) | 1176 | static unsigned get_num_discard_bios(struct dm_target *ti) |
| 1169 | { | 1177 | { |
| 1170 | return ti->num_discard_requests; | 1178 | return ti->num_discard_bios; |
| 1171 | } | 1179 | } |
| 1172 | 1180 | ||
| 1173 | static unsigned get_num_write_same_requests(struct dm_target *ti) | 1181 | static unsigned get_num_write_same_bios(struct dm_target *ti) |
| 1174 | { | 1182 | { |
| 1175 | return ti->num_write_same_requests; | 1183 | return ti->num_write_same_bios; |
| 1176 | } | 1184 | } |
| 1177 | 1185 | ||
| 1178 | typedef bool (*is_split_required_fn)(struct dm_target *ti); | 1186 | typedef bool (*is_split_required_fn)(struct dm_target *ti); |
| 1179 | 1187 | ||
| 1180 | static bool is_split_required_for_discard(struct dm_target *ti) | 1188 | static bool is_split_required_for_discard(struct dm_target *ti) |
| 1181 | { | 1189 | { |
| 1182 | return ti->split_discard_requests; | 1190 | return ti->split_discard_bios; |
| 1183 | } | 1191 | } |
| 1184 | 1192 | ||
| 1185 | static int __clone_and_map_changing_extent_only(struct clone_info *ci, | 1193 | static int __send_changing_extent_only(struct clone_info *ci, |
| 1186 | get_num_requests_fn get_num_requests, | 1194 | get_num_bios_fn get_num_bios, |
| 1187 | is_split_required_fn is_split_required) | 1195 | is_split_required_fn is_split_required) |
| 1188 | { | 1196 | { |
| 1189 | struct dm_target *ti; | 1197 | struct dm_target *ti; |
| 1190 | sector_t len; | 1198 | sector_t len; |
| 1191 | unsigned num_requests; | 1199 | unsigned num_bios; |
| 1192 | 1200 | ||
| 1193 | do { | 1201 | do { |
| 1194 | ti = dm_table_find_target(ci->map, ci->sector); | 1202 | ti = dm_table_find_target(ci->map, ci->sector); |
| @@ -1201,8 +1209,8 @@ static int __clone_and_map_changing_extent_only(struct clone_info *ci, | |||
| 1201 | * reconfiguration might also have changed that since the | 1209 | * reconfiguration might also have changed that since the |
| 1202 | * check was performed. | 1210 | * check was performed. |
| 1203 | */ | 1211 | */ |
| 1204 | num_requests = get_num_requests ? get_num_requests(ti) : 0; | 1212 | num_bios = get_num_bios ? get_num_bios(ti) : 0; |
| 1205 | if (!num_requests) | 1213 | if (!num_bios) |
| 1206 | return -EOPNOTSUPP; | 1214 | return -EOPNOTSUPP; |
| 1207 | 1215 | ||
| 1208 | if (is_split_required && !is_split_required(ti)) | 1216 | if (is_split_required && !is_split_required(ti)) |
| @@ -1210,7 +1218,7 @@ static int __clone_and_map_changing_extent_only(struct clone_info *ci, | |||
| 1210 | else | 1218 | else |
| 1211 | len = min(ci->sector_count, max_io_len(ci->sector, ti)); | 1219 | len = min(ci->sector_count, max_io_len(ci->sector, ti)); |
| 1212 | 1220 | ||
| 1213 | __issue_target_requests(ci, ti, num_requests, len); | 1221 | __send_duplicate_bios(ci, ti, num_bios, len); |
| 1214 | 1222 | ||
| 1215 | ci->sector += len; | 1223 | ci->sector += len; |
| 1216 | } while (ci->sector_count -= len); | 1224 | } while (ci->sector_count -= len); |
| @@ -1218,108 +1226,129 @@ static int __clone_and_map_changing_extent_only(struct clone_info *ci, | |||
| 1218 | return 0; | 1226 | return 0; |
| 1219 | } | 1227 | } |
| 1220 | 1228 | ||
| 1221 | static int __clone_and_map_discard(struct clone_info *ci) | 1229 | static int __send_discard(struct clone_info *ci) |
| 1222 | { | 1230 | { |
| 1223 | return __clone_and_map_changing_extent_only(ci, get_num_discard_requests, | 1231 | return __send_changing_extent_only(ci, get_num_discard_bios, |
| 1224 | is_split_required_for_discard); | 1232 | is_split_required_for_discard); |
| 1225 | } | 1233 | } |
| 1226 | 1234 | ||
| 1227 | static int __clone_and_map_write_same(struct clone_info *ci) | 1235 | static int __send_write_same(struct clone_info *ci) |
| 1228 | { | 1236 | { |
| 1229 | return __clone_and_map_changing_extent_only(ci, get_num_write_same_requests, NULL); | 1237 | return __send_changing_extent_only(ci, get_num_write_same_bios, NULL); |
| 1230 | } | 1238 | } |
| 1231 | 1239 | ||
| 1232 | static int __clone_and_map(struct clone_info *ci) | 1240 | /* |
| 1241 | * Find maximum number of sectors / bvecs we can process with a single bio. | ||
| 1242 | */ | ||
| 1243 | static sector_t __len_within_target(struct clone_info *ci, sector_t max, int *idx) | ||
| 1233 | { | 1244 | { |
| 1234 | struct bio *bio = ci->bio; | 1245 | struct bio *bio = ci->bio; |
| 1235 | struct dm_target *ti; | 1246 | sector_t bv_len, total_len = 0; |
| 1236 | sector_t len = 0, max; | ||
| 1237 | struct dm_target_io *tio; | ||
| 1238 | 1247 | ||
| 1239 | if (unlikely(bio->bi_rw & REQ_DISCARD)) | 1248 | for (*idx = ci->idx; max && (*idx < bio->bi_vcnt); (*idx)++) { |
| 1240 | return __clone_and_map_discard(ci); | 1249 | bv_len = to_sector(bio->bi_io_vec[*idx].bv_len); |
| 1241 | else if (unlikely(bio->bi_rw & REQ_WRITE_SAME)) | ||
| 1242 | return __clone_and_map_write_same(ci); | ||
| 1243 | 1250 | ||
| 1244 | ti = dm_table_find_target(ci->map, ci->sector); | 1251 | if (bv_len > max) |
| 1245 | if (!dm_target_is_valid(ti)) | 1252 | break; |
| 1246 | return -EIO; | ||
| 1247 | |||
| 1248 | max = max_io_len(ci->sector, ti); | ||
| 1249 | 1253 | ||
| 1250 | if (ci->sector_count <= max) { | 1254 | max -= bv_len; |
| 1251 | /* | 1255 | total_len += bv_len; |
| 1252 | * Optimise for the simple case where we can do all of | 1256 | } |
| 1253 | * the remaining io with a single clone. | ||
| 1254 | */ | ||
| 1255 | __clone_and_map_simple(ci, ti); | ||
| 1256 | 1257 | ||
| 1257 | } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { | 1258 | return total_len; |
| 1258 | /* | 1259 | } |
| 1259 | * There are some bvecs that don't span targets. | ||
| 1260 | * Do as many of these as possible. | ||
| 1261 | */ | ||
| 1262 | int i; | ||
| 1263 | sector_t remaining = max; | ||
| 1264 | sector_t bv_len; | ||
| 1265 | 1260 | ||
| 1266 | for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { | 1261 | static int __split_bvec_across_targets(struct clone_info *ci, |
| 1267 | bv_len = to_sector(bio->bi_io_vec[i].bv_len); | 1262 | struct dm_target *ti, sector_t max) |
| 1263 | { | ||
| 1264 | struct bio *bio = ci->bio; | ||
| 1265 | struct bio_vec *bv = bio->bi_io_vec + ci->idx; | ||
| 1266 | sector_t remaining = to_sector(bv->bv_len); | ||
| 1267 | unsigned offset = 0; | ||
| 1268 | sector_t len; | ||
| 1268 | 1269 | ||
| 1269 | if (bv_len > remaining) | 1270 | do { |
| 1270 | break; | 1271 | if (offset) { |
| 1272 | ti = dm_table_find_target(ci->map, ci->sector); | ||
| 1273 | if (!dm_target_is_valid(ti)) | ||
| 1274 | return -EIO; | ||
| 1271 | 1275 | ||
| 1272 | remaining -= bv_len; | 1276 | max = max_io_len(ci->sector, ti); |
| 1273 | len += bv_len; | ||
| 1274 | } | 1277 | } |
| 1275 | 1278 | ||
| 1276 | tio = alloc_tio(ci, ti, bio->bi_max_vecs); | 1279 | len = min(remaining, max); |
| 1277 | clone_bio(tio, bio, ci->sector, ci->idx, i - ci->idx, len, | 1280 | |
| 1278 | ci->md->bs); | 1281 | __clone_and_map_data_bio(ci, ti, ci->sector, 1, ci->idx, 0, |
| 1279 | __map_bio(ti, tio); | 1282 | bv->bv_offset + offset, len, 1); |
| 1280 | 1283 | ||
| 1281 | ci->sector += len; | 1284 | ci->sector += len; |
| 1282 | ci->sector_count -= len; | 1285 | ci->sector_count -= len; |
| 1283 | ci->idx = i; | 1286 | offset += to_bytes(len); |
| 1287 | } while (remaining -= len); | ||
| 1284 | 1288 | ||
| 1285 | } else { | 1289 | ci->idx++; |
| 1286 | /* | ||
| 1287 | * Handle a bvec that must be split between two or more targets. | ||
| 1288 | */ | ||
| 1289 | struct bio_vec *bv = bio->bi_io_vec + ci->idx; | ||
| 1290 | sector_t remaining = to_sector(bv->bv_len); | ||
| 1291 | unsigned int offset = 0; | ||
| 1292 | 1290 | ||
| 1293 | do { | 1291 | return 0; |
| 1294 | if (offset) { | 1292 | } |
| 1295 | ti = dm_table_find_target(ci->map, ci->sector); | ||
| 1296 | if (!dm_target_is_valid(ti)) | ||
| 1297 | return -EIO; | ||
| 1298 | 1293 | ||
| 1299 | max = max_io_len(ci->sector, ti); | 1294 | /* |
| 1300 | } | 1295 | * Select the correct strategy for processing a non-flush bio. |
| 1296 | */ | ||
| 1297 | static int __split_and_process_non_flush(struct clone_info *ci) | ||
| 1298 | { | ||
| 1299 | struct bio *bio = ci->bio; | ||
| 1300 | struct dm_target *ti; | ||
| 1301 | sector_t len, max; | ||
| 1302 | int idx; | ||
| 1301 | 1303 | ||
| 1302 | len = min(remaining, max); | 1304 | if (unlikely(bio->bi_rw & REQ_DISCARD)) |
| 1305 | return __send_discard(ci); | ||
| 1306 | else if (unlikely(bio->bi_rw & REQ_WRITE_SAME)) | ||
| 1307 | return __send_write_same(ci); | ||
| 1303 | 1308 | ||
| 1304 | tio = alloc_tio(ci, ti, 1); | 1309 | ti = dm_table_find_target(ci->map, ci->sector); |
| 1305 | split_bvec(tio, bio, ci->sector, ci->idx, | 1310 | if (!dm_target_is_valid(ti)) |
| 1306 | bv->bv_offset + offset, len, ci->md->bs); | 1311 | return -EIO; |
| 1307 | 1312 | ||
| 1308 | __map_bio(ti, tio); | 1313 | max = max_io_len(ci->sector, ti); |
| 1314 | |||
| 1315 | /* | ||
| 1316 | * Optimise for the simple case where we can do all of | ||
| 1317 | * the remaining io with a single clone. | ||
| 1318 | */ | ||
| 1319 | if (ci->sector_count <= max) { | ||
| 1320 | __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs, | ||
| 1321 | ci->idx, bio->bi_vcnt - ci->idx, 0, | ||
| 1322 | ci->sector_count, 0); | ||
| 1323 | ci->sector_count = 0; | ||
| 1324 | return 0; | ||
| 1325 | } | ||
| 1326 | |||
| 1327 | /* | ||
| 1328 | * There are some bvecs that don't span targets. | ||
| 1329 | * Do as many of these as possible. | ||
| 1330 | */ | ||
| 1331 | if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { | ||
| 1332 | len = __len_within_target(ci, max, &idx); | ||
| 1309 | 1333 | ||
| 1310 | ci->sector += len; | 1334 | __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs, |
| 1311 | ci->sector_count -= len; | 1335 | ci->idx, idx - ci->idx, 0, len, 0); |
| 1312 | offset += to_bytes(len); | ||
| 1313 | } while (remaining -= len); | ||
| 1314 | 1336 | ||
| 1315 | ci->idx++; | 1337 | ci->sector += len; |
| 1338 | ci->sector_count -= len; | ||
| 1339 | ci->idx = idx; | ||
| 1340 | |||
| 1341 | return 0; | ||
| 1316 | } | 1342 | } |
| 1317 | 1343 | ||
| 1318 | return 0; | 1344 | /* |
| 1345 | * Handle a bvec that must be split between two or more targets. | ||
| 1346 | */ | ||
| 1347 | return __split_bvec_across_targets(ci, ti, max); | ||
| 1319 | } | 1348 | } |
| 1320 | 1349 | ||
| 1321 | /* | 1350 | /* |
| 1322 | * Split the bio into several clones and submit it to targets. | 1351 | * Entry point to split a bio into clones and submit them to the targets. |
| 1323 | */ | 1352 | */ |
| 1324 | static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | 1353 | static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) |
| 1325 | { | 1354 | { |
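__clone_and_map() is dismantled above: __split_and_process_non_flush() picks one of three strategies, __len_within_target() counts how many whole bvecs fit in the current target, __split_bvec_across_targets() handles a bvec spanning targets, and every data clone funnels through __clone_and_map_data_bio(), which also honours ti->num_write_bios for targets wanting duplicate writes. A condensed restatement of the dispatch, using only names from the hunk:

    /* Strategy selection in __split_and_process_non_flush(), condensed: */
    if (ci->sector_count <= max) {
        /* 1. Remaining io fits one target: a single whole clone. */
        __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
                                 ci->idx, bio->bi_vcnt - ci->idx, 0,
                                 ci->sector_count, 0);
    } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
        /* 2. Some whole bvecs fit: clone as many as possible. */
        len = __len_within_target(ci, max, &idx);
        __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
                                 ci->idx, idx - ci->idx, 0, len, 0);
    } else {
        /* 3. A single bvec spans targets: byte-accurate split path. */
        return __split_bvec_across_targets(ci, ti, max);
    }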
| @@ -1343,16 +1372,17 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | |||
| 1343 | ci.idx = bio->bi_idx; | 1372 | ci.idx = bio->bi_idx; |
| 1344 | 1373 | ||
| 1345 | start_io_acct(ci.io); | 1374 | start_io_acct(ci.io); |
| 1375 | |||
| 1346 | if (bio->bi_rw & REQ_FLUSH) { | 1376 | if (bio->bi_rw & REQ_FLUSH) { |
| 1347 | ci.bio = &ci.md->flush_bio; | 1377 | ci.bio = &ci.md->flush_bio; |
| 1348 | ci.sector_count = 0; | 1378 | ci.sector_count = 0; |
| 1349 | error = __clone_and_map_empty_flush(&ci); | 1379 | error = __send_empty_flush(&ci); |
| 1350 | /* dec_pending submits any data associated with flush */ | 1380 | /* dec_pending submits any data associated with flush */ |
| 1351 | } else { | 1381 | } else { |
| 1352 | ci.bio = bio; | 1382 | ci.bio = bio; |
| 1353 | ci.sector_count = bio_sectors(bio); | 1383 | ci.sector_count = bio_sectors(bio); |
| 1354 | while (ci.sector_count && !error) | 1384 | while (ci.sector_count && !error) |
| 1355 | error = __clone_and_map(&ci); | 1385 | error = __split_and_process_non_flush(&ci); |
| 1356 | } | 1386 | } |
| 1357 | 1387 | ||
| 1358 | /* drop the extra reference count */ | 1388 | /* drop the extra reference count */ |
| @@ -1756,62 +1786,38 @@ static void free_minor(int minor) | |||
| 1756 | */ | 1786 | */ |
| 1757 | static int specific_minor(int minor) | 1787 | static int specific_minor(int minor) |
| 1758 | { | 1788 | { |
| 1759 | int r, m; | 1789 | int r; |
| 1760 | 1790 | ||
| 1761 | if (minor >= (1 << MINORBITS)) | 1791 | if (minor >= (1 << MINORBITS)) |
| 1762 | return -EINVAL; | 1792 | return -EINVAL; |
| 1763 | 1793 | ||
| 1764 | r = idr_pre_get(&_minor_idr, GFP_KERNEL); | 1794 | idr_preload(GFP_KERNEL); |
| 1765 | if (!r) | ||
| 1766 | return -ENOMEM; | ||
| 1767 | |||
| 1768 | spin_lock(&_minor_lock); | 1795 | spin_lock(&_minor_lock); |
| 1769 | 1796 | ||
| 1770 | if (idr_find(&_minor_idr, minor)) { | 1797 | r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT); |
| 1771 | r = -EBUSY; | ||
| 1772 | goto out; | ||
| 1773 | } | ||
| 1774 | 1798 | ||
| 1775 | r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m); | ||
| 1776 | if (r) | ||
| 1777 | goto out; | ||
| 1778 | |||
| 1779 | if (m != minor) { | ||
| 1780 | idr_remove(&_minor_idr, m); | ||
| 1781 | r = -EBUSY; | ||
| 1782 | goto out; | ||
| 1783 | } | ||
| 1784 | |||
| 1785 | out: | ||
| 1786 | spin_unlock(&_minor_lock); | 1799 | spin_unlock(&_minor_lock); |
| 1787 | return r; | 1800 | idr_preload_end(); |
| 1801 | if (r < 0) | ||
| 1802 | return r == -ENOSPC ? -EBUSY : r; | ||
| 1803 | return 0; | ||
| 1788 | } | 1804 | } |
| 1789 | 1805 | ||
| 1790 | static int next_free_minor(int *minor) | 1806 | static int next_free_minor(int *minor) |
| 1791 | { | 1807 | { |
| 1792 | int r, m; | 1808 | int r; |
| 1793 | |||
| 1794 | r = idr_pre_get(&_minor_idr, GFP_KERNEL); | ||
| 1795 | if (!r) | ||
| 1796 | return -ENOMEM; | ||
| 1797 | 1809 | ||
| 1810 | idr_preload(GFP_KERNEL); | ||
| 1798 | spin_lock(&_minor_lock); | 1811 | spin_lock(&_minor_lock); |
| 1799 | 1812 | ||
| 1800 | r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m); | 1813 | r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT); |
| 1801 | if (r) | ||
| 1802 | goto out; | ||
| 1803 | 1814 | ||
| 1804 | if (m >= (1 << MINORBITS)) { | ||
| 1805 | idr_remove(&_minor_idr, m); | ||
| 1806 | r = -ENOSPC; | ||
| 1807 | goto out; | ||
| 1808 | } | ||
| 1809 | |||
| 1810 | *minor = m; | ||
| 1811 | |||
| 1812 | out: | ||
| 1813 | spin_unlock(&_minor_lock); | 1815 | spin_unlock(&_minor_lock); |
| 1814 | return r; | 1816 | idr_preload_end(); |
| 1817 | if (r < 0) | ||
| 1818 | return r; | ||
| 1819 | *minor = r; | ||
| 1820 | return 0; | ||
| 1815 | } | 1821 | } |
| 1816 | 1822 | ||
| 1817 | static const struct block_device_operations dm_blk_dops; | 1823 | static const struct block_device_operations dm_blk_dops; |
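specific_minor() and next_free_minor() are converted from the old idr_pre_get()/idr_get_new() pair to the idr_alloc() API merged in the same window. The pattern is: preload outside the spinlock (where sleeping is allowed), then allocate with GFP_NOWAIT inside it. A standalone sketch of that pattern:

    #include <linux/idr.h>
    #include <linux/spinlock.h>

    /* Allocate an id in [start, end) for ptr; returns the id or -errno. */
    static int alloc_id(struct idr *idr, spinlock_t *lock, void *ptr,
                        int start, int end)
    {
        int id;

        idr_preload(GFP_KERNEL);    /* sleepable prealloc backing GFP_NOWAIT */
        spin_lock(lock);
        id = idr_alloc(idr, ptr, start, end, GFP_NOWAIT);
        spin_unlock(lock);
        idr_preload_end();

        return id;  /* -ENOSPC when [start, end) is exhausted, which
                       specific_minor() above remaps to -EBUSY */
    }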
| @@ -1949,8 +1955,6 @@ static void free_dev(struct mapped_device *md) | |||
| 1949 | unlock_fs(md); | 1955 | unlock_fs(md); |
| 1950 | bdput(md->bdev); | 1956 | bdput(md->bdev); |
| 1951 | destroy_workqueue(md->wq); | 1957 | destroy_workqueue(md->wq); |
| 1952 | if (md->tio_pool) | ||
| 1953 | mempool_destroy(md->tio_pool); | ||
| 1954 | if (md->io_pool) | 1958 | if (md->io_pool) |
| 1955 | mempool_destroy(md->io_pool); | 1959 | mempool_destroy(md->io_pool); |
| 1956 | if (md->bs) | 1960 | if (md->bs) |
| @@ -1973,24 +1977,33 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t) | |||
| 1973 | { | 1977 | { |
| 1974 | struct dm_md_mempools *p = dm_table_get_md_mempools(t); | 1978 | struct dm_md_mempools *p = dm_table_get_md_mempools(t); |
| 1975 | 1979 | ||
| 1976 | if (md->io_pool && (md->tio_pool || dm_table_get_type(t) == DM_TYPE_BIO_BASED) && md->bs) { | 1980 | if (md->io_pool && md->bs) { |
| 1977 | /* | 1981 | /* The md already has necessary mempools. */ |
| 1978 | * The md already has necessary mempools. Reload just the | 1982 | if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) { |
| 1979 | * bioset because front_pad may have changed because | 1983 | /* |
| 1980 | * a different table was loaded. | 1984 | * Reload bioset because front_pad may have changed |
| 1981 | */ | 1985 | * because a different table was loaded. |
| 1982 | bioset_free(md->bs); | 1986 | */ |
| 1983 | md->bs = p->bs; | 1987 | bioset_free(md->bs); |
| 1984 | p->bs = NULL; | 1988 | md->bs = p->bs; |
| 1989 | p->bs = NULL; | ||
| 1990 | } else if (dm_table_get_type(t) == DM_TYPE_REQUEST_BASED) { | ||
| 1991 | /* | ||
| 1992 | * There's no need to reload with request-based dm | ||
| 1993 | * because the size of front_pad doesn't change. | ||
| 1994 | * Note for future: If you are to reload bioset, | ||
| 1995 | * prep-ed requests in the queue may refer | ||
| 1996 | * to bio from the old bioset, so you must walk | ||
| 1997 | * through the queue to unprep. | ||
| 1998 | */ | ||
| 1999 | } | ||
| 1985 | goto out; | 2000 | goto out; |
| 1986 | } | 2001 | } |
| 1987 | 2002 | ||
| 1988 | BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); | 2003 | BUG_ON(!p || md->io_pool || md->bs); |
| 1989 | 2004 | ||
| 1990 | md->io_pool = p->io_pool; | 2005 | md->io_pool = p->io_pool; |
| 1991 | p->io_pool = NULL; | 2006 | p->io_pool = NULL; |
| 1992 | md->tio_pool = p->tio_pool; | ||
| 1993 | p->tio_pool = NULL; | ||
| 1994 | md->bs = p->bs; | 2007 | md->bs = p->bs; |
| 1995 | p->bs = NULL; | 2008 | p->bs = NULL; |
| 1996 | 2009 | ||
| @@ -2421,7 +2434,7 @@ static void dm_queue_flush(struct mapped_device *md) | |||
| 2421 | */ | 2434 | */ |
| 2422 | struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) | 2435 | struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) |
| 2423 | { | 2436 | { |
| 2424 | struct dm_table *live_map, *map = ERR_PTR(-EINVAL); | 2437 | struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); |
| 2425 | struct queue_limits limits; | 2438 | struct queue_limits limits; |
| 2426 | int r; | 2439 | int r; |
| 2427 | 2440 | ||
| @@ -2444,10 +2457,12 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) | |||
| 2444 | dm_table_put(live_map); | 2457 | dm_table_put(live_map); |
| 2445 | } | 2458 | } |
| 2446 | 2459 | ||
| 2447 | r = dm_calculate_queue_limits(table, &limits); | 2460 | if (!live_map) { |
| 2448 | if (r) { | 2461 | r = dm_calculate_queue_limits(table, &limits); |
| 2449 | map = ERR_PTR(r); | 2462 | if (r) { |
| 2450 | goto out; | 2463 | map = ERR_PTR(r); |
| 2464 | goto out; | ||
| 2465 | } | ||
| 2451 | } | 2466 | } |
| 2452 | 2467 | ||
| 2453 | map = __bind(md, table, &limits); | 2468 | map = __bind(md, table, &limits); |
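dm_swap_table() now initialises live_map to NULL and recomputes queue limits from the incoming table only when no live table was found; when one exists, the limits already copied from it are reused, apparently so a new table that cannot yet supply sane limits does not clobber working ones. A hedged sketch of the guarded flow (the elided middle of the function is summarised in a comment):

    struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
    struct queue_limits limits;

    /* ... if a live table exists, copy its limits and dm_table_put() it ... */

    if (!live_map) {            /* no live limits to inherit: compute fresh */
        r = dm_calculate_queue_limits(table, &limits);
        if (r) {
            map = ERR_PTR(r);
            goto out;
        }
    }

    map = __bind(md, table, &limits);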
| @@ -2745,52 +2760,42 @@ EXPORT_SYMBOL_GPL(dm_noflush_suspending); | |||
| 2745 | 2760 | ||
| 2746 | struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size) | 2761 | struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size) |
| 2747 | { | 2762 | { |
| 2748 | struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL); | 2763 | struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL); |
| 2749 | unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS; | 2764 | struct kmem_cache *cachep; |
| 2765 | unsigned int pool_size; | ||
| 2766 | unsigned int front_pad; | ||
| 2750 | 2767 | ||
| 2751 | if (!pools) | 2768 | if (!pools) |
| 2752 | return NULL; | 2769 | return NULL; |
| 2753 | 2770 | ||
| 2754 | per_bio_data_size = roundup(per_bio_data_size, __alignof__(struct dm_target_io)); | 2771 | if (type == DM_TYPE_BIO_BASED) { |
| 2772 | cachep = _io_cache; | ||
| 2773 | pool_size = 16; | ||
| 2774 | front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); | ||
| 2775 | } else if (type == DM_TYPE_REQUEST_BASED) { | ||
| 2776 | cachep = _rq_tio_cache; | ||
| 2777 | pool_size = MIN_IOS; | ||
| 2778 | front_pad = offsetof(struct dm_rq_clone_bio_info, clone); | ||
| 2779 | /* per_bio_data_size is not used. See __bind_mempools(). */ | ||
| 2780 | WARN_ON(per_bio_data_size != 0); | ||
| 2781 | } else | ||
| 2782 | goto out; | ||
| 2755 | 2783 | ||
| 2756 | pools->io_pool = (type == DM_TYPE_BIO_BASED) ? | 2784 | pools->io_pool = mempool_create_slab_pool(MIN_IOS, cachep); |
| 2757 | mempool_create_slab_pool(MIN_IOS, _io_cache) : | ||
| 2758 | mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache); | ||
| 2759 | if (!pools->io_pool) | 2785 | if (!pools->io_pool) |
| 2760 | goto free_pools_and_out; | 2786 | goto out; |
| 2761 | |||
| 2762 | pools->tio_pool = NULL; | ||
| 2763 | if (type == DM_TYPE_REQUEST_BASED) { | ||
| 2764 | pools->tio_pool = mempool_create_slab_pool(MIN_IOS, _rq_tio_cache); | ||
| 2765 | if (!pools->tio_pool) | ||
| 2766 | goto free_io_pool_and_out; | ||
| 2767 | } | ||
| 2768 | 2787 | ||
| 2769 | pools->bs = (type == DM_TYPE_BIO_BASED) ? | 2788 | pools->bs = bioset_create(pool_size, front_pad); |
| 2770 | bioset_create(pool_size, | ||
| 2771 | per_bio_data_size + offsetof(struct dm_target_io, clone)) : | ||
| 2772 | bioset_create(pool_size, | ||
| 2773 | offsetof(struct dm_rq_clone_bio_info, clone)); | ||
| 2774 | if (!pools->bs) | 2789 | if (!pools->bs) |
| 2775 | goto free_tio_pool_and_out; | 2790 | goto out; |
| 2776 | 2791 | ||
| 2777 | if (integrity && bioset_integrity_create(pools->bs, pool_size)) | 2792 | if (integrity && bioset_integrity_create(pools->bs, pool_size)) |
| 2778 | goto free_bioset_and_out; | 2793 | goto out; |
| 2779 | 2794 | ||
| 2780 | return pools; | 2795 | return pools; |
| 2781 | 2796 | ||
| 2782 | free_bioset_and_out: | 2797 | out: |
| 2783 | bioset_free(pools->bs); | 2798 | dm_free_md_mempools(pools); |
| 2784 | |||
| 2785 | free_tio_pool_and_out: | ||
| 2786 | if (pools->tio_pool) | ||
| 2787 | mempool_destroy(pools->tio_pool); | ||
| 2788 | |||
| 2789 | free_io_pool_and_out: | ||
| 2790 | mempool_destroy(pools->io_pool); | ||
| 2791 | |||
| 2792 | free_pools_and_out: | ||
| 2793 | kfree(pools); | ||
| 2794 | 2799 | ||
| 2795 | return NULL; | 2800 | return NULL; |
| 2796 | } | 2801 | } |
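dm_alloc_md_mempools() collapses the separate io/tio pools into one slab-backed io_pool whose cache depends on the table type, and replaces the ladder of error labels with a single out: path that calls dm_free_md_mempools(). Switching kmalloc() to kzalloc() is what makes that safe: the free path checks each member for NULL. Condensed:

    pools = kzalloc(sizeof(*pools), GFP_KERNEL);    /* all members start NULL */
    if (!pools)
        return NULL;

    cachep = (type == DM_TYPE_BIO_BASED) ? _io_cache : _rq_tio_cache;

    pools->io_pool = mempool_create_slab_pool(MIN_IOS, cachep);
    if (!pools->io_pool)
        goto out;

    pools->bs = bioset_create(pool_size, front_pad);
    if (!pools->bs)
        goto out;

    return pools;

    out:
    dm_free_md_mempools(pools); /* frees whichever members were allocated */
    return NULL;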
| @@ -2803,9 +2808,6 @@ void dm_free_md_mempools(struct dm_md_mempools *pools) | |||
| 2803 | if (pools->io_pool) | 2808 | if (pools->io_pool) |
| 2804 | mempool_destroy(pools->io_pool); | 2809 | mempool_destroy(pools->io_pool); |
| 2805 | 2810 | ||
| 2806 | if (pools->tio_pool) | ||
| 2807 | mempool_destroy(pools->tio_pool); | ||
| 2808 | |||
| 2809 | if (pools->bs) | 2811 | if (pools->bs) |
| 2810 | bioset_free(pools->bs); | 2812 | bioset_free(pools->bs); |
| 2811 | 2813 | ||
diff --git a/drivers/md/md.c b/drivers/md/md.c index 3db3d1b271f7..fcb878f88796 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
| @@ -307,6 +307,10 @@ static void md_make_request(struct request_queue *q, struct bio *bio) | |||
| 307 | bio_io_error(bio); | 307 | bio_io_error(bio); |
| 308 | return; | 308 | return; |
| 309 | } | 309 | } |
| 310 | if (mddev->ro == 1 && unlikely(rw == WRITE)) { | ||
| 311 | bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS); | ||
| 312 | return; | ||
| 313 | } | ||
| 310 | smp_rmb(); /* Ensure implications of 'active' are visible */ | 314 | smp_rmb(); /* Ensure implications of 'active' are visible */ |
| 311 | rcu_read_lock(); | 315 | rcu_read_lock(); |
| 312 | if (mddev->suspended) { | 316 | if (mddev->suspended) { |
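md_make_request() now rejects writes while the array is in strict read-only mode (mddev->ro == 1), failing them with -EROFS, while still completing zero-length flush bios successfully since they carry no data. The check, annotated:

    if (mddev->ro == 1 && unlikely(rw == WRITE)) {
        /* empty flushes succeed (0); data-carrying writes fail with -EROFS */
        bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
        return;
    }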
| @@ -2994,6 +2998,9 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
| 2994 | } else if (!sectors) | 2998 | } else if (!sectors) |
| 2995 | sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - | 2999 | sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - |
| 2996 | rdev->data_offset; | 3000 | rdev->data_offset; |
| 3001 | if (!my_mddev->pers->resize) | ||
| 3002 | /* Cannot change size for RAID0 or Linear etc */ | ||
| 3003 | return -EINVAL; | ||
| 2997 | } | 3004 | } |
| 2998 | if (sectors < my_mddev->dev_sectors) | 3005 | if (sectors < my_mddev->dev_sectors) |
| 2999 | return -EINVAL; /* component must fit device */ | 3006 | return -EINVAL; /* component must fit device */ |
| @@ -6525,7 +6532,17 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, | |||
| 6525 | mddev->ro = 0; | 6532 | mddev->ro = 0; |
| 6526 | sysfs_notify_dirent_safe(mddev->sysfs_state); | 6533 | sysfs_notify_dirent_safe(mddev->sysfs_state); |
| 6527 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 6534 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
| 6528 | md_wakeup_thread(mddev->thread); | 6535 | /* mddev_unlock will wake thread */ |
| 6536 | /* If a device failed while we were read-only, we | ||
| 6537 | * need to make sure the metadata is updated now. | ||
| 6538 | */ | ||
| 6539 | if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) { | ||
| 6540 | mddev_unlock(mddev); | ||
| 6541 | wait_event(mddev->sb_wait, | ||
| 6542 | !test_bit(MD_CHANGE_DEVS, &mddev->flags) && | ||
| 6543 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); | ||
| 6544 | mddev_lock(mddev); | ||
| 6545 | } | ||
| 6529 | } else { | 6546 | } else { |
| 6530 | err = -EROFS; | 6547 | err = -EROFS; |
| 6531 | goto abort_unlock; | 6548 | goto abort_unlock; |
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig index ceb359050a59..19b268795415 100644 --- a/drivers/md/persistent-data/Kconfig +++ b/drivers/md/persistent-data/Kconfig | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | config DM_PERSISTENT_DATA | 1 | config DM_PERSISTENT_DATA |
| 2 | tristate | 2 | tristate |
| 3 | depends on BLK_DEV_DM && EXPERIMENTAL | 3 | depends on BLK_DEV_DM |
| 4 | select LIBCRC32C | 4 | select LIBCRC32C |
| 5 | select DM_BUFIO | 5 | select DM_BUFIO |
| 6 | ---help--- | 6 | ---help--- |
diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile index d8e7cb767c1e..ff528792c358 100644 --- a/drivers/md/persistent-data/Makefile +++ b/drivers/md/persistent-data/Makefile | |||
| @@ -1,5 +1,7 @@ | |||
| 1 | obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o | 1 | obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o |
| 2 | dm-persistent-data-objs := \ | 2 | dm-persistent-data-objs := \ |
| 3 | dm-array.o \ | ||
| 4 | dm-bitset.o \ | ||
| 3 | dm-block-manager.o \ | 5 | dm-block-manager.o \ |
| 4 | dm-space-map-common.o \ | 6 | dm-space-map-common.o \ |
| 5 | dm-space-map-disk.o \ | 7 | dm-space-map-disk.o \ |
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c new file mode 100644 index 000000000000..172147eb1d40 --- /dev/null +++ b/drivers/md/persistent-data/dm-array.c | |||
| @@ -0,0 +1,808 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2012 Red Hat, Inc. | ||
| 3 | * | ||
| 4 | * This file is released under the GPL. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include "dm-array.h" | ||
| 8 | #include "dm-space-map.h" | ||
| 9 | #include "dm-transaction-manager.h" | ||
| 10 | |||
| 11 | #include <linux/export.h> | ||
| 12 | #include <linux/device-mapper.h> | ||
| 13 | |||
| 14 | #define DM_MSG_PREFIX "array" | ||
| 15 | |||
| 16 | /*----------------------------------------------------------------*/ | ||
| 17 | |||
| 18 | /* | ||
| 19 | * The array is implemented as a fully populated btree, which points to | ||
| 20 | * blocks that contain the packed values. This is more space efficient | ||
| 21 | * than just using a btree since we don't store 1 key per value. | ||
| 22 | */ | ||
| 23 | struct array_block { | ||
| 24 | __le32 csum; | ||
| 25 | __le32 max_entries; | ||
| 26 | __le32 nr_entries; | ||
| 27 | __le32 value_size; | ||
| 28 | __le64 blocknr; /* Block this node is supposed to live in. */ | ||
| 29 | } __packed; | ||
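To get a feel for the packing, here is an illustrative userspace calculation of how many values one array block holds; the 4096-byte block size and 8-byte value size are assumptions for the example, not fixed by the format:

#include <stdint.h>
#include <stdio.h>

/* Mirrors the 24-byte on-disk header above. */
struct array_block_hdr {
	uint32_t csum, max_entries, nr_entries, value_size;
	uint64_t blocknr;
};

int main(void)
{
	size_t block_size = 4096;	/* assumed metadata block size */
	size_t value_size = 8;		/* e.g. one __le64 per entry */

	printf("%zu entries per block\n",
	       (block_size - sizeof(struct array_block_hdr)) / value_size);
	/* (4096 - 24) / 8 = 509 values per block, versus a btree that
	 * would also store a 64-bit key alongside every value. */
	return 0;
}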
| 30 | |||
| 31 | /*----------------------------------------------------------------*/ | ||
| 32 | |||
| 33 | /* | ||
| 34 | * Validator methods. As usual we calculate a checksum, and also write the | ||
| 35 | * block location into the header (paranoia about ssds remapping areas by | ||
| 36 | * mistake). | ||
| 37 | */ | ||
| 38 | #define CSUM_XOR 595846735 | ||
| 39 | |||
| 40 | static void array_block_prepare_for_write(struct dm_block_validator *v, | ||
| 41 | struct dm_block *b, | ||
| 42 | size_t size_of_block) | ||
| 43 | { | ||
| 44 | struct array_block *bh_le = dm_block_data(b); | ||
| 45 | |||
| 46 | bh_le->blocknr = cpu_to_le64(dm_block_location(b)); | ||
| 47 | bh_le->csum = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries, | ||
| 48 | size_of_block - sizeof(__le32), | ||
| 49 | CSUM_XOR)); | ||
| 50 | } | ||
| 51 | |||
| 52 | static int array_block_check(struct dm_block_validator *v, | ||
| 53 | struct dm_block *b, | ||
| 54 | size_t size_of_block) | ||
| 55 | { | ||
| 56 | struct array_block *bh_le = dm_block_data(b); | ||
| 57 | __le32 csum_disk; | ||
| 58 | |||
| 59 | if (dm_block_location(b) != le64_to_cpu(bh_le->blocknr)) { | ||
| 60 | DMERR_LIMIT("array_block_check failed: blocknr %llu != wanted %llu", | ||
| 61 | (unsigned long long) le64_to_cpu(bh_le->blocknr), | ||
| 62 | (unsigned long long) dm_block_location(b)); | ||
| 63 | return -ENOTBLK; | ||
| 64 | } | ||
| 65 | |||
| 66 | csum_disk = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries, | ||
| 67 | size_of_block - sizeof(__le32), | ||
| 68 | CSUM_XOR)); | ||
| 69 | if (csum_disk != bh_le->csum) { | ||
| 70 | DMERR_LIMIT("array_block_check failed: csum %u != wanted %u", | ||
| 71 | (unsigned) le32_to_cpu(csum_disk), | ||
| 72 | (unsigned) le32_to_cpu(bh_le->csum)); | ||
| 73 | return -EILSEQ; | ||
| 74 | } | ||
| 75 | |||
| 76 | return 0; | ||
| 77 | } | ||
| 78 | |||
| 79 | static struct dm_block_validator array_validator = { | ||
| 80 | .name = "array", | ||
| 81 | .prepare_for_write = array_block_prepare_for_write, | ||
| 82 | .check = array_block_check | ||
| 83 | }; | ||
| 84 | |||
| 85 | /*----------------------------------------------------------------*/ | ||
| 86 | |||
| 87 | /* | ||
| 88 | * Functions for manipulating the array blocks. | ||
| 89 | */ | ||
| 90 | |||
| 91 | /* | ||
| 92 | * Returns a pointer to a value within an array block. | ||
| 93 | * | ||
| 94 | * index - The index into _this_ specific block. | ||
| 95 | */ | ||
| 96 | static void *element_at(struct dm_array_info *info, struct array_block *ab, | ||
| 97 | unsigned index) | ||
| 98 | { | ||
| 99 | unsigned char *entry = (unsigned char *) (ab + 1); | ||
| 100 | |||
| 101 | entry += index * info->value_type.size; | ||
| 102 | |||
| 103 | return entry; | ||
| 104 | } | ||
| 105 | |||
| 106 | /* | ||
| 107 | * Utility function that calls one of the value_type methods on every value | ||
| 108 | * in an array block. | ||
| 109 | */ | ||
| 110 | static void on_entries(struct dm_array_info *info, struct array_block *ab, | ||
| 111 | void (*fn)(void *, const void *)) | ||
| 112 | { | ||
| 113 | unsigned i, nr_entries = le32_to_cpu(ab->nr_entries); | ||
| 114 | |||
| 115 | for (i = 0; i < nr_entries; i++) | ||
| 116 | fn(info->value_type.context, element_at(info, ab, i)); | ||
| 117 | } | ||
| 118 | |||
| 119 | /* | ||
| 120 | * Increment every value in an array block. | ||
| 121 | */ | ||
| 122 | static void inc_ablock_entries(struct dm_array_info *info, struct array_block *ab) | ||
| 123 | { | ||
| 124 | struct dm_btree_value_type *vt = &info->value_type; | ||
| 125 | |||
| 126 | if (vt->inc) | ||
| 127 | on_entries(info, ab, vt->inc); | ||
| 128 | } | ||
| 129 | |||
| 130 | /* | ||
| 131 | * Decrement every value in an array block. | ||
| 132 | */ | ||
| 133 | static void dec_ablock_entries(struct dm_array_info *info, struct array_block *ab) | ||
| 134 | { | ||
| 135 | struct dm_btree_value_type *vt = &info->value_type; | ||
| 136 | |||
| 137 | if (vt->dec) | ||
| 138 | on_entries(info, ab, vt->dec); | ||
| 139 | } | ||
| 140 | |||
| 141 | /* | ||
| 142 | * Each array block can hold this many values. | ||
| 143 | */ | ||
| 144 | static uint32_t calc_max_entries(size_t value_size, size_t size_of_block) | ||
| 145 | { | ||
| 146 | return (size_of_block - sizeof(struct array_block)) / value_size; | ||
| 147 | } | ||
| 148 | |||
| 149 | /* | ||
| 150 | * Allocate a new array block. The caller will need to unlock block. | ||
| 151 | */ | ||
| 152 | static int alloc_ablock(struct dm_array_info *info, size_t size_of_block, | ||
| 153 | uint32_t max_entries, | ||
| 154 | struct dm_block **block, struct array_block **ab) | ||
| 155 | { | ||
| 156 | int r; | ||
| 157 | |||
| 158 | r = dm_tm_new_block(info->btree_info.tm, &array_validator, block); | ||
| 159 | if (r) | ||
| 160 | return r; | ||
| 161 | |||
| 162 | (*ab) = dm_block_data(*block); | ||
| 163 | (*ab)->max_entries = cpu_to_le32(max_entries); | ||
| 164 | (*ab)->nr_entries = cpu_to_le32(0); | ||
| 165 | (*ab)->value_size = cpu_to_le32(info->value_type.size); | ||
| 166 | |||
| 167 | return 0; | ||
| 168 | } | ||
| 169 | |||
| 170 | /* | ||
| 171 | * Pad an array block out with a particular value. Each entry added | ||
| 172 | * will cause the value_type's inc method to be called. new_nr must be | ||
| 173 | * at least the current number of entries. | ||
| 174 | */ | ||
| 175 | static void fill_ablock(struct dm_array_info *info, struct array_block *ab, | ||
| 176 | const void *value, unsigned new_nr) | ||
| 177 | { | ||
| 178 | unsigned i; | ||
| 179 | uint32_t nr_entries; | ||
| 180 | struct dm_btree_value_type *vt = &info->value_type; | ||
| 181 | |||
| 182 | BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); | ||
| 183 | BUG_ON(new_nr < le32_to_cpu(ab->nr_entries)); | ||
| 184 | |||
| 185 | nr_entries = le32_to_cpu(ab->nr_entries); | ||
| 186 | for (i = nr_entries; i < new_nr; i++) { | ||
| 187 | if (vt->inc) | ||
| 188 | vt->inc(vt->context, value); | ||
| 189 | memcpy(element_at(info, ab, i), value, vt->size); | ||
| 190 | } | ||
| 191 | ab->nr_entries = cpu_to_le32(new_nr); | ||
| 192 | } | ||
| 193 | |||
| 194 | /* | ||
| 195 | * Remove some entries from the back of an array block. Every value | ||
| 196 | * removed will be decremented. new_nr must be <= the current number of | ||
| 197 | * entries. | ||
| 198 | */ | ||
| 199 | static void trim_ablock(struct dm_array_info *info, struct array_block *ab, | ||
| 200 | unsigned new_nr) | ||
| 201 | { | ||
| 202 | unsigned i; | ||
| 203 | uint32_t nr_entries; | ||
| 204 | struct dm_btree_value_type *vt = &info->value_type; | ||
| 205 | |||
| 206 | BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); | ||
| 207 | BUG_ON(new_nr > le32_to_cpu(ab->nr_entries)); | ||
| 208 | |||
| 209 | nr_entries = le32_to_cpu(ab->nr_entries); | ||
| 210 | for (i = nr_entries; i > new_nr; i--) | ||
| 211 | if (vt->dec) | ||
| 212 | vt->dec(vt->context, element_at(info, ab, i - 1)); | ||
| 213 | ab->nr_entries = cpu_to_le32(new_nr); | ||
| 214 | } | ||
| 215 | |||
| 216 | /* | ||
| 217 | * Read locks a block, and coerces it to an array block. The caller must | ||
| 218 | * unlock 'block' when finished. | ||
| 219 | */ | ||
| 220 | static int get_ablock(struct dm_array_info *info, dm_block_t b, | ||
| 221 | struct dm_block **block, struct array_block **ab) | ||
| 222 | { | ||
| 223 | int r; | ||
| 224 | |||
| 225 | r = dm_tm_read_lock(info->btree_info.tm, b, &array_validator, block); | ||
| 226 | if (r) | ||
| 227 | return r; | ||
| 228 | |||
| 229 | *ab = dm_block_data(*block); | ||
| 230 | return 0; | ||
| 231 | } | ||
| 232 | |||
| 233 | /* | ||
| 234 | * Unlocks an array block. | ||
| 235 | */ | ||
| 236 | static int unlock_ablock(struct dm_array_info *info, struct dm_block *block) | ||
| 237 | { | ||
| 238 | return dm_tm_unlock(info->btree_info.tm, block); | ||
| 239 | } | ||
| 240 | |||
| 241 | /*----------------------------------------------------------------*/ | ||
| 242 | |||
| 243 | /* | ||
| 244 | * Btree manipulation. | ||
| 245 | */ | ||
| 246 | |||
| 247 | /* | ||
| 248 | * Looks up an array block in the btree, and then read locks it. | ||
| 249 | * | ||
| 250 | * index is the index of the array_block (ie. the array index | ||
| 251 | * / max_entries). | ||
| 252 | */ | ||
| 253 | static int lookup_ablock(struct dm_array_info *info, dm_block_t root, | ||
| 254 | unsigned index, struct dm_block **block, | ||
| 255 | struct array_block **ab) | ||
| 256 | { | ||
| 257 | int r; | ||
| 258 | uint64_t key = index; | ||
| 259 | __le64 block_le; | ||
| 260 | |||
| 261 | r = dm_btree_lookup(&info->btree_info, root, &key, &block_le); | ||
| 262 | if (r) | ||
| 263 | return r; | ||
| 264 | |||
| 265 | return get_ablock(info, le64_to_cpu(block_le), block, ab); | ||
| 266 | } | ||
| 267 | |||
| 268 | /* | ||
| 269 | * Insert an array block into the btree. The block is _not_ unlocked. | ||
| 270 | */ | ||
| 271 | static int insert_ablock(struct dm_array_info *info, uint64_t index, | ||
| 272 | struct dm_block *block, dm_block_t *root) | ||
| 273 | { | ||
| 274 | __le64 block_le = cpu_to_le64(dm_block_location(block)); | ||
| 275 | |||
| 276 | __dm_bless_for_disk(block_le); | ||
| 277 | return dm_btree_insert(&info->btree_info, *root, &index, &block_le, root); | ||
| 278 | } | ||
| 279 | |||
| 280 | /* | ||
| 281 | * Looks up an array block in the btree. Then shadows it, and updates the | ||
| 282 | * btree to point to this new shadow. 'root' is an input/output parameter | ||
| 283 | * for both the current root block, and the new one. | ||
| 284 | */ | ||
| 285 | static int shadow_ablock(struct dm_array_info *info, dm_block_t *root, | ||
| 286 | unsigned index, struct dm_block **block, | ||
| 287 | struct array_block **ab) | ||
| 288 | { | ||
| 289 | int r, inc; | ||
| 290 | uint64_t key = index; | ||
| 291 | dm_block_t b; | ||
| 292 | __le64 block_le; | ||
| 293 | |||
| 294 | /* | ||
| 295 | * lookup | ||
| 296 | */ | ||
| 297 | r = dm_btree_lookup(&info->btree_info, *root, &key, &block_le); | ||
| 298 | if (r) | ||
| 299 | return r; | ||
| 300 | b = le64_to_cpu(block_le); | ||
| 301 | |||
| 302 | /* | ||
| 303 | * shadow | ||
| 304 | */ | ||
| 305 | r = dm_tm_shadow_block(info->btree_info.tm, b, | ||
| 306 | &array_validator, block, &inc); | ||
| 307 | if (r) | ||
| 308 | return r; | ||
| 309 | |||
| 310 | *ab = dm_block_data(*block); | ||
| 311 | if (inc) | ||
| 312 | inc_ablock_entries(info, *ab); | ||
| 313 | |||
| 314 | /* | ||
| 315 | * Reinsert. | ||
| 316 | * | ||
| 317 | * The shadow op will often be a noop. Only insert if it really | ||
| 318 | * copied data. | ||
| 319 | */ | ||
| 320 | if (dm_block_location(*block) != b) | ||
| 321 | r = insert_ablock(info, index, *block, root); | ||
| 322 | |||
| 323 | return r; | ||
| 324 | } | ||
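shadow_ablock() is the copy-on-write step every mutator goes through. An illustrative sequence, with invented block numbers:

/*
 * If block 7 was allocated in an earlier, committed transaction,
 * dm_tm_shadow_block() copies it to a fresh block, say 23, and sets
 * 'inc' so the entry refcounts are bumped for the now-shared values;
 * the btree is then updated to point at block 23.  If block 7 was
 * already shadowed within this transaction it comes back unchanged
 * and the reinsert is skipped.
 */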
| 325 | |||
| 326 | /* | ||
| 327 | * Allocate a new array block and fill it with some values. | ||
| 328 | */ | ||
| 329 | static int insert_new_ablock(struct dm_array_info *info, size_t size_of_block, | ||
| 330 | uint32_t max_entries, | ||
| 331 | unsigned block_index, uint32_t nr, | ||
| 332 | const void *value, dm_block_t *root) | ||
| 333 | { | ||
| 334 | int r; | ||
| 335 | struct dm_block *block; | ||
| 336 | struct array_block *ab; | ||
| 337 | |||
| 338 | r = alloc_ablock(info, size_of_block, max_entries, &block, &ab); | ||
| 339 | if (r) | ||
| 340 | return r; | ||
| 341 | |||
| 342 | fill_ablock(info, ab, value, nr); | ||
| 343 | r = insert_ablock(info, block_index, block, root); | ||
| 344 | unlock_ablock(info, block); | ||
| 345 | |||
| 346 | return r; | ||
| 347 | } | ||
| 348 | |||
| 349 | static int insert_full_ablocks(struct dm_array_info *info, size_t size_of_block, | ||
| 350 | unsigned begin_block, unsigned end_block, | ||
| 351 | unsigned max_entries, const void *value, | ||
| 352 | dm_block_t *root) | ||
| 353 | { | ||
| 354 | int r = 0; | ||
| 355 | |||
| 356 | for (; !r && begin_block != end_block; begin_block++) | ||
| 357 | r = insert_new_ablock(info, size_of_block, max_entries, begin_block, max_entries, value, root); | ||
| 358 | |||
| 359 | return r; | ||
| 360 | } | ||
| 361 | |||
| 362 | /* | ||
| 363 | * There are a bunch of functions involved with resizing an array. This | ||
| 364 | * structure holds information that is commonly needed by them. It is | ||
| 365 | * purely here to reduce the parameter count. | ||
| 366 | */ | ||
| 367 | struct resize { | ||
| 368 | /* | ||
| 369 | * Describes the array. | ||
| 370 | */ | ||
| 371 | struct dm_array_info *info; | ||
| 372 | |||
| 373 | /* | ||
| 374 | * The current root of the array. This gets updated. | ||
| 375 | */ | ||
| 376 | dm_block_t root; | ||
| 377 | |||
| 378 | /* | ||
| 379 | * Metadata block size. Used to calculate the nr entries in an | ||
| 380 | * array block. | ||
| 381 | */ | ||
| 382 | size_t size_of_block; | ||
| 383 | |||
| 384 | /* | ||
| 385 | * Maximum nr entries in an array block. | ||
| 386 | */ | ||
| 387 | unsigned max_entries; | ||
| 388 | |||
| 389 | /* | ||
| 390 | * nr of completely full blocks in the array. | ||
| 391 | * | ||
| 392 | * 'old' refers to before the resize, 'new' after. | ||
| 393 | */ | ||
| 394 | unsigned old_nr_full_blocks, new_nr_full_blocks; | ||
| 395 | |||
| 396 | /* | ||
| 397 | * Number of entries in the final block. 0 iff only full blocks in | ||
| 398 | * the array. | ||
| 399 | */ | ||
| 400 | unsigned old_nr_entries_in_last_block, new_nr_entries_in_last_block; | ||
| 401 | |||
| 402 | /* | ||
| 403 | * The default value used when growing the array. | ||
| 404 | */ | ||
| 405 | const void *value; | ||
| 406 | }; | ||
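As a worked example of how these fields are derived (the numbers are assumed for illustration, taking the 509-entries-per-block figure from the earlier packing example):

/*
 * Illustrative arithmetic only, assuming max_entries == 509, when
 * resizing from 1000 to 2000 entries:
 *
 *	old_nr_full_blocks           = 1000 / 509 = 1
 *	old_nr_entries_in_last_block = 1000 % 509 = 491
 *	new_nr_full_blocks           = 2000 / 509 = 3
 *	new_nr_entries_in_last_block = 2000 % 509 = 473
 *
 * Since new_size > old_size, grow() runs; and since the number of
 * full blocks increases, it takes the grow_needs_more_blocks() path
 * below.
 */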
| 407 | |||
| 408 | /* | ||
| 409 | * Removes a consecutive set of array blocks from the btree. The values | ||
| 410 | * in the blocks are decremented as a side effect of the btree remove. | ||
| 411 | * | ||
| 412 | * begin_index - the index of the first array block to remove. | ||
| 413 | * end_index - the one-past-the-end value. ie. this block is not removed. | ||
| 414 | */ | ||
| 415 | static int drop_blocks(struct resize *resize, unsigned begin_index, | ||
| 416 | unsigned end_index) | ||
| 417 | { | ||
| 418 | int r; | ||
| 419 | |||
| 420 | while (begin_index != end_index) { | ||
| 421 | uint64_t key = begin_index++; | ||
| 422 | r = dm_btree_remove(&resize->info->btree_info, resize->root, | ||
| 423 | &key, &resize->root); | ||
| 424 | if (r) | ||
| 425 | return r; | ||
| 426 | } | ||
| 427 | |||
| 428 | return 0; | ||
| 429 | } | ||
| 430 | |||
| 431 | /* | ||
| 432 | * Calculates how many blocks are needed for the array. | ||
| 433 | */ | ||
| 434 | static unsigned total_nr_blocks_needed(unsigned nr_full_blocks, | ||
| 435 | unsigned nr_entries_in_last_block) | ||
| 436 | { | ||
| 437 | return nr_full_blocks + (nr_entries_in_last_block ? 1 : 0); | ||
| 438 | } | ||
| 439 | |||
| 440 | /* | ||
| 441 | * Shrink an array. | ||
| 442 | */ | ||
| 443 | static int shrink(struct resize *resize) | ||
| 444 | { | ||
| 445 | int r; | ||
| 446 | unsigned begin, end; | ||
| 447 | struct dm_block *block; | ||
| 448 | struct array_block *ab; | ||
| 449 | |||
| 450 | /* | ||
| 451 | * Lose some blocks from the back? | ||
| 452 | */ | ||
| 453 | if (resize->new_nr_full_blocks < resize->old_nr_full_blocks) { | ||
| 454 | begin = total_nr_blocks_needed(resize->new_nr_full_blocks, | ||
| 455 | resize->new_nr_entries_in_last_block); | ||
| 456 | end = total_nr_blocks_needed(resize->old_nr_full_blocks, | ||
| 457 | resize->old_nr_entries_in_last_block); | ||
| 458 | |||
| 459 | r = drop_blocks(resize, begin, end); | ||
| 460 | if (r) | ||
| 461 | return r; | ||
| 462 | } | ||
| 463 | |||
| 464 | /* | ||
| 465 | * Trim the new tail block | ||
| 466 | */ | ||
| 467 | if (resize->new_nr_entries_in_last_block) { | ||
| 468 | r = shadow_ablock(resize->info, &resize->root, | ||
| 469 | resize->new_nr_full_blocks, &block, &ab); | ||
| 470 | if (r) | ||
| 471 | return r; | ||
| 472 | |||
| 473 | trim_ablock(resize->info, ab, resize->new_nr_entries_in_last_block); | ||
| 474 | unlock_ablock(resize->info, block); | ||
| 475 | } | ||
| 476 | |||
| 477 | return 0; | ||
| 478 | } | ||
| 479 | |||
| 480 | /* | ||
| 481 | * Grow an array. | ||
| 482 | */ | ||
| 483 | static int grow_extend_tail_block(struct resize *resize, uint32_t new_nr_entries) | ||
| 484 | { | ||
| 485 | int r; | ||
| 486 | struct dm_block *block; | ||
| 487 | struct array_block *ab; | ||
| 488 | |||
| 489 | r = shadow_ablock(resize->info, &resize->root, | ||
| 490 | resize->old_nr_full_blocks, &block, &ab); | ||
| 491 | if (r) | ||
| 492 | return r; | ||
| 493 | |||
| 494 | fill_ablock(resize->info, ab, resize->value, new_nr_entries); | ||
| 495 | unlock_ablock(resize->info, block); | ||
| 496 | |||
| 497 | return r; | ||
| 498 | } | ||
| 499 | |||
| 500 | static int grow_add_tail_block(struct resize *resize) | ||
| 501 | { | ||
| 502 | return insert_new_ablock(resize->info, resize->size_of_block, | ||
| 503 | resize->max_entries, | ||
| 504 | resize->new_nr_full_blocks, | ||
| 505 | resize->new_nr_entries_in_last_block, | ||
| 506 | resize->value, &resize->root); | ||
| 507 | } | ||
| 508 | |||
| 509 | static int grow_needs_more_blocks(struct resize *resize) | ||
| 510 | { | ||
| 511 | int r; | ||
| 512 | |||
| 513 | if (resize->old_nr_entries_in_last_block > 0) { | ||
| 514 | r = grow_extend_tail_block(resize, resize->max_entries); | ||
| 515 | if (r) | ||
| 516 | return r; | ||
| 517 | } | ||
| 518 | |||
| 519 | r = insert_full_ablocks(resize->info, resize->size_of_block, | ||
| 520 | resize->old_nr_full_blocks, | ||
| 521 | resize->new_nr_full_blocks, | ||
| 522 | resize->max_entries, resize->value, | ||
| 523 | &resize->root); | ||
| 524 | if (r) | ||
| 525 | return r; | ||
| 526 | |||
| 527 | if (resize->new_nr_entries_in_last_block) | ||
| 528 | r = grow_add_tail_block(resize); | ||
| 529 | |||
| 530 | return r; | ||
| 531 | } | ||
| 532 | |||
| 533 | static int grow(struct resize *resize) | ||
| 534 | { | ||
| 535 | if (resize->new_nr_full_blocks > resize->old_nr_full_blocks) | ||
| 536 | return grow_needs_more_blocks(resize); | ||
| 537 | |||
| 538 | else if (resize->old_nr_entries_in_last_block) | ||
| 539 | return grow_extend_tail_block(resize, resize->new_nr_entries_in_last_block); | ||
| 540 | |||
| 541 | else | ||
| 542 | return grow_add_tail_block(resize); | ||
| 543 | } | ||
| 544 | |||
| 545 | /*----------------------------------------------------------------*/ | ||
| 546 | |||
| 547 | /* | ||
| 548 | * These are the value_type functions for the btree elements, which point | ||
| 549 | * to array blocks. | ||
| 550 | */ | ||
| 551 | static void block_inc(void *context, const void *value) | ||
| 552 | { | ||
| 553 | __le64 block_le; | ||
| 554 | struct dm_array_info *info = context; | ||
| 555 | |||
| 556 | memcpy(&block_le, value, sizeof(block_le)); | ||
| 557 | dm_tm_inc(info->btree_info.tm, le64_to_cpu(block_le)); | ||
| 558 | } | ||
| 559 | |||
| 560 | static void block_dec(void *context, const void *value) | ||
| 561 | { | ||
| 562 | int r; | ||
| 563 | uint64_t b; | ||
| 564 | __le64 block_le; | ||
| 565 | uint32_t ref_count; | ||
| 566 | struct dm_block *block; | ||
| 567 | struct array_block *ab; | ||
| 568 | struct dm_array_info *info = context; | ||
| 569 | |||
| 570 | memcpy(&block_le, value, sizeof(block_le)); | ||
| 571 | b = le64_to_cpu(block_le); | ||
| 572 | |||
| 573 | r = dm_tm_ref(info->btree_info.tm, b, &ref_count); | ||
| 574 | if (r) { | ||
| 575 | DMERR_LIMIT("couldn't get reference count for block %llu", | ||
| 576 | (unsigned long long) b); | ||
| 577 | return; | ||
| 578 | } | ||
| 579 | |||
| 580 | if (ref_count == 1) { | ||
| 581 | /* | ||
| 582 | * We're about to drop the last reference to this ablock. | ||
| 583 | * So we need to decrement the ref count of the contents. | ||
| 584 | */ | ||
| 585 | r = get_ablock(info, b, &block, &ab); | ||
| 586 | if (r) { | ||
| 587 | DMERR_LIMIT("couldn't get array block %llu", | ||
| 588 | (unsigned long long) b); | ||
| 589 | return; | ||
| 590 | } | ||
| 591 | |||
| 592 | dec_ablock_entries(info, ab); | ||
| 593 | unlock_ablock(info, block); | ||
| 594 | } | ||
| 595 | |||
| 596 | dm_tm_dec(info->btree_info.tm, b); | ||
| 597 | } | ||
| 598 | |||
| 599 | static int block_equal(void *context, const void *value1, const void *value2) | ||
| 600 | { | ||
| 601 | return !memcmp(value1, value2, sizeof(__le64)); | ||
| 602 | } | ||
| 603 | |||
| 604 | /*----------------------------------------------------------------*/ | ||
| 605 | |||
| 606 | void dm_array_info_init(struct dm_array_info *info, | ||
| 607 | struct dm_transaction_manager *tm, | ||
| 608 | struct dm_btree_value_type *vt) | ||
| 609 | { | ||
| 610 | struct dm_btree_value_type *bvt = &info->btree_info.value_type; | ||
| 611 | |||
| 612 | memcpy(&info->value_type, vt, sizeof(info->value_type)); | ||
| 613 | info->btree_info.tm = tm; | ||
| 614 | info->btree_info.levels = 1; | ||
| 615 | |||
| 616 | bvt->context = info; | ||
| 617 | bvt->size = sizeof(__le64); | ||
| 618 | bvt->inc = block_inc; | ||
| 619 | bvt->dec = block_dec; | ||
| 620 | bvt->equal = block_equal; | ||
| 621 | } | ||
| 622 | EXPORT_SYMBOL_GPL(dm_array_info_init); | ||
| 623 | |||
| 624 | int dm_array_empty(struct dm_array_info *info, dm_block_t *root) | ||
| 625 | { | ||
| 626 | return dm_btree_empty(&info->btree_info, root); | ||
| 627 | } | ||
| 628 | EXPORT_SYMBOL_GPL(dm_array_empty); | ||
| 629 | |||
| 630 | static int array_resize(struct dm_array_info *info, dm_block_t root, | ||
| 631 | uint32_t old_size, uint32_t new_size, | ||
| 632 | const void *value, dm_block_t *new_root) | ||
| 633 | { | ||
| 634 | int r; | ||
| 635 | struct resize resize; | ||
| 636 | |||
| 637 | if (old_size == new_size) | ||
| 638 | return 0; | ||
| 639 | |||
| 640 | resize.info = info; | ||
| 641 | resize.root = root; | ||
| 642 | resize.size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); | ||
| 643 | resize.max_entries = calc_max_entries(info->value_type.size, | ||
| 644 | resize.size_of_block); | ||
| 645 | |||
| 646 | resize.old_nr_full_blocks = old_size / resize.max_entries; | ||
| 647 | resize.old_nr_entries_in_last_block = old_size % resize.max_entries; | ||
| 648 | resize.new_nr_full_blocks = new_size / resize.max_entries; | ||
| 649 | resize.new_nr_entries_in_last_block = new_size % resize.max_entries; | ||
| 650 | resize.value = value; | ||
| 651 | |||
| 652 | r = ((new_size > old_size) ? grow : shrink)(&resize); | ||
| 653 | if (r) | ||
| 654 | return r; | ||
| 655 | |||
| 656 | *new_root = resize.root; | ||
| 657 | return 0; | ||
| 658 | } | ||
| 659 | |||
| 660 | int dm_array_resize(struct dm_array_info *info, dm_block_t root, | ||
| 661 | uint32_t old_size, uint32_t new_size, | ||
| 662 | const void *value, dm_block_t *new_root) | ||
| 663 | __dm_written_to_disk(value) | ||
| 664 | { | ||
| 665 | int r = array_resize(info, root, old_size, new_size, value, new_root); | ||
| 666 | __dm_unbless_for_disk(value); | ||
| 667 | return r; | ||
| 668 | } | ||
| 669 | EXPORT_SYMBOL_GPL(dm_array_resize); | ||
| 670 | |||
| 671 | int dm_array_del(struct dm_array_info *info, dm_block_t root) | ||
| 672 | { | ||
| 673 | return dm_btree_del(&info->btree_info, root); | ||
| 674 | } | ||
| 675 | EXPORT_SYMBOL_GPL(dm_array_del); | ||
| 676 | |||
| 677 | int dm_array_get_value(struct dm_array_info *info, dm_block_t root, | ||
| 678 | uint32_t index, void *value_le) | ||
| 679 | { | ||
| 680 | int r; | ||
| 681 | struct dm_block *block; | ||
| 682 | struct array_block *ab; | ||
| 683 | size_t size_of_block; | ||
| 684 | unsigned entry, max_entries; | ||
| 685 | |||
| 686 | size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); | ||
| 687 | max_entries = calc_max_entries(info->value_type.size, size_of_block); | ||
| 688 | |||
| 689 | r = lookup_ablock(info, root, index / max_entries, &block, &ab); | ||
| 690 | if (r) | ||
| 691 | return r; | ||
| 692 | |||
| 693 | entry = index % max_entries; | ||
| 694 | if (entry >= le32_to_cpu(ab->nr_entries)) | ||
| 695 | r = -ENODATA; | ||
| 696 | else | ||
| 697 | memcpy(value_le, element_at(info, ab, entry), | ||
| 698 | info->value_type.size); | ||
| 699 | |||
| 700 | unlock_ablock(info, block); | ||
| 701 | return r; | ||
| 702 | } | ||
| 703 | EXPORT_SYMBOL_GPL(dm_array_get_value); | ||
| 704 | |||
| 705 | static int array_set_value(struct dm_array_info *info, dm_block_t root, | ||
| 706 | uint32_t index, const void *value, dm_block_t *new_root) | ||
| 707 | { | ||
| 708 | int r; | ||
| 709 | struct dm_block *block; | ||
| 710 | struct array_block *ab; | ||
| 711 | size_t size_of_block; | ||
| 712 | unsigned max_entries; | ||
| 713 | unsigned entry; | ||
| 714 | void *old_value; | ||
| 715 | struct dm_btree_value_type *vt = &info->value_type; | ||
| 716 | |||
| 717 | size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); | ||
| 718 | max_entries = calc_max_entries(info->value_type.size, size_of_block); | ||
| 719 | |||
| 720 | r = shadow_ablock(info, &root, index / max_entries, &block, &ab); | ||
| 721 | if (r) | ||
| 722 | return r; | ||
| 723 | *new_root = root; | ||
| 724 | |||
| 725 | entry = index % max_entries; | ||
| 726 | if (entry >= le32_to_cpu(ab->nr_entries)) { | ||
| 727 | r = -ENODATA; | ||
| 728 | goto out; | ||
| 729 | } | ||
| 730 | |||
| 731 | old_value = element_at(info, ab, entry); | ||
| 732 | if (vt->dec && | ||
| 733 | (!vt->equal || !vt->equal(vt->context, old_value, value))) { | ||
| 734 | vt->dec(vt->context, old_value); | ||
| 735 | if (vt->inc) | ||
| 736 | vt->inc(vt->context, value); | ||
| 737 | } | ||
| 738 | |||
| 739 | memcpy(old_value, value, info->value_type.size); | ||
| 740 | |||
| 741 | out: | ||
| 742 | unlock_ablock(info, block); | ||
| 743 | return r; | ||
| 744 | } | ||
| 745 | |||
| 746 | int dm_array_set_value(struct dm_array_info *info, dm_block_t root, | ||
| 747 | uint32_t index, const void *value, dm_block_t *new_root) | ||
| 748 | __dm_written_to_disk(value) | ||
| 749 | { | ||
| 750 | int r; | ||
| 751 | |||
| 752 | r = array_set_value(info, root, index, value, new_root); | ||
| 753 | __dm_unbless_for_disk(value); | ||
| 754 | return r; | ||
| 755 | } | ||
| 756 | EXPORT_SYMBOL_GPL(dm_array_set_value); | ||
| 757 | |||
| 758 | struct walk_info { | ||
| 759 | struct dm_array_info *info; | ||
| 760 | int (*fn)(void *context, uint64_t key, void *leaf); | ||
| 761 | void *context; | ||
| 762 | }; | ||
| 763 | |||
| 764 | static int walk_ablock(void *context, uint64_t *keys, void *leaf) | ||
| 765 | { | ||
| 766 | struct walk_info *wi = context; | ||
| 767 | |||
| 768 | int r; | ||
| 769 | unsigned i; | ||
| 770 | __le64 block_le; | ||
| 771 | unsigned nr_entries, max_entries; | ||
| 772 | struct dm_block *block; | ||
| 773 | struct array_block *ab; | ||
| 774 | |||
| 775 | memcpy(&block_le, leaf, sizeof(block_le)); | ||
| 776 | r = get_ablock(wi->info, le64_to_cpu(block_le), &block, &ab); | ||
| 777 | if (r) | ||
| 778 | return r; | ||
| 779 | |||
| 780 | max_entries = le32_to_cpu(ab->max_entries); | ||
| 781 | nr_entries = le32_to_cpu(ab->nr_entries); | ||
| 782 | for (i = 0; i < nr_entries; i++) { | ||
| 783 | r = wi->fn(wi->context, keys[0] * max_entries + i, | ||
| 784 | element_at(wi->info, ab, i)); | ||
| 785 | |||
| 786 | if (r) | ||
| 787 | break; | ||
| 788 | } | ||
| 789 | |||
| 790 | unlock_ablock(wi->info, block); | ||
| 791 | return r; | ||
| 792 | } | ||
| 793 | |||
| 794 | int dm_array_walk(struct dm_array_info *info, dm_block_t root, | ||
| 795 | int (*fn)(void *, uint64_t key, void *leaf), | ||
| 796 | void *context) | ||
| 797 | { | ||
| 798 | struct walk_info wi; | ||
| 799 | |||
| 800 | wi.info = info; | ||
| 801 | wi.fn = fn; | ||
| 802 | wi.context = context; | ||
| 803 | |||
| 804 | return dm_btree_walk(&info->btree_info, root, walk_ablock, &wi); | ||
| 805 | } | ||
| 806 | EXPORT_SYMBOL_GPL(dm_array_walk); | ||
| 807 | |||
| 808 | /*----------------------------------------------------------------*/ | ||
diff --git a/drivers/md/persistent-data/dm-array.h b/drivers/md/persistent-data/dm-array.h new file mode 100644 index 000000000000..ea177d6fa58f --- /dev/null +++ b/drivers/md/persistent-data/dm-array.h | |||
| @@ -0,0 +1,166 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2012 Red Hat, Inc. | ||
| 3 | * | ||
| 4 | * This file is released under the GPL. | ||
| 5 | */ | ||
| 6 | #ifndef _LINUX_DM_ARRAY_H | ||
| 7 | #define _LINUX_DM_ARRAY_H | ||
| 8 | |||
| 9 | #include "dm-btree.h" | ||
| 10 | |||
| 11 | /*----------------------------------------------------------------*/ | ||
| 12 | |||
| 13 | /* | ||
| 14 | * The dm-array is a persistent version of an array. It packs the data | ||
| 15 | * more efficiently than a btree, which results in less disk space use | ||
| 16 | * and a performance boost. The element get and set operations are still | ||
| 17 | * O(ln(n)), but with a much smaller constant. | ||
| 18 | * | ||
| 19 | * The value type structure is reused from the btree type to support proper | ||
| 20 | * reference counting of values. | ||
| 21 | * | ||
| 22 | * The arrays implicitly know their length, and bounds are checked for | ||
| 23 | * lookups and updates. The length isn't stored in an accessible place | ||
| 24 | * because it would waste a whole metadata block. Make sure you store the | ||
| 25 | * size along with the array root in your encompassing data. | ||
| 26 | * | ||
| 27 | * Array entries are indexed via an unsigned integer starting from zero. | ||
| 28 | * Arrays are not sparse; if you resize an array to have 'n' entries then | ||
| 29 | * 'n - 1' will be the last valid index. | ||
| 30 | * | ||
| 31 | * Typical use: | ||
| 32 | * | ||
| 33 | * a) initialise a dm_array_info structure. This describes the array | ||
| 34 | * values and ties it into a specific transaction manager. It holds no | ||
| 35 | * instance data; the same info can be used for many similar arrays if | ||
| 36 | * you wish. | ||
| 37 | * | ||
| 38 | * b) Get yourself a root. The root is the index of a block of data on the | ||
| 39 | * disk that holds a particular instance of an array. You may have a | ||
| 40 | * pre-existing root in your metadata that you wish to use, or you may | ||
| 41 | * want to create a brand new, empty array with dm_array_empty(). | ||
| 42 | * | ||
| 43 | * Like the other data structures in this library, dm_array objects are | ||
| 44 | * immutable between transactions. Update functions will return you the | ||
| 45 | * root for a _new_ array. If you've incremented the old root, via | ||
| 46 | * dm_tm_inc(), before calling the update function you may continue to use | ||
| 47 | * it in parallel with the new root. | ||
| 48 | * | ||
| 49 | * c) resize an array with dm_array_resize(). | ||
| 50 | * | ||
| 51 | * d) Get a value from the array with dm_array_get_value(). | ||
| 52 | * | ||
| 53 | * e) Set a value in the array with dm_array_set_value(). | ||
| 54 | * | ||
| 55 | * f) Walk an array of values in index order with dm_array_walk(). More | ||
| 56 | * efficient than making many calls to dm_array_get_value(). | ||
| 57 | * | ||
| 58 | * g) Destroy the array with dm_array_del(). This tells the transaction | ||
| 59 | * manager that you're no longer using this data structure so it can | ||
| 60 | * recycle its blocks. (dm_array_dec() would be a better name for it, | ||
| 61 | * but del is in keeping with dm_btree_del()). | ||
| 62 | */ | ||
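The steps above translate into code roughly as follows. This is a hedged sketch, not taken from the tree: the plain __le64 value type, the sizes and the index are arbitrary, 'tm' is assumed to be an existing transaction manager, and error handling is abbreviated. (Step f, dm_array_walk(), is illustrated further down.)

static int example_array_use(struct dm_transaction_manager *tm)
{
	int r;
	dm_block_t root;
	__le64 value;
	struct dm_array_info info;
	struct dm_btree_value_type vt = {
		.size = sizeof(__le64),	/* plain values: no inc/dec/equal */
	};

	dm_array_info_init(&info, tm, &vt);			/* step a */

	r = dm_array_empty(&info, &root);			/* step b */
	if (r)
		return r;

	value = cpu_to_le64(0);
	__dm_bless_for_disk(&value);
	r = dm_array_resize(&info, root, 0, 100, &value, &root); /* step c */
	if (r)
		return r;

	r = dm_array_get_value(&info, root, 42, &value);	/* step d */
	if (r)
		return r;

	value = cpu_to_le64(le64_to_cpu(value) + 1);
	__dm_bless_for_disk(&value);
	r = dm_array_set_value(&info, root, 42, &value, &root); /* step e */
	if (r)
		return r;

	return dm_array_del(&info, root);			/* step g */
}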
| 63 | |||
| 64 | /* | ||
| 65 | * Describes an array. Don't initialise this structure yourself, use the | ||
| 66 | * init function below. | ||
| 67 | */ | ||
| 68 | struct dm_array_info { | ||
| 69 | struct dm_transaction_manager *tm; | ||
| 70 | struct dm_btree_value_type value_type; | ||
| 71 | struct dm_btree_info btree_info; | ||
| 72 | }; | ||
| 73 | |||
| 74 | /* | ||
| 75 | * Sets up a dm_array_info structure. You don't need to do anything with | ||
| 76 | * this structure when you finish using it. | ||
| 77 | * | ||
| 78 | * info - the structure being filled in. | ||
| 79 | * tm - the transaction manager that should supervise this structure. | ||
| 80 | * vt - describes the leaf values. | ||
| 81 | */ | ||
| 82 | void dm_array_info_init(struct dm_array_info *info, | ||
| 83 | struct dm_transaction_manager *tm, | ||
| 84 | struct dm_btree_value_type *vt); | ||
| 85 | |||
| 86 | /* | ||
| 87 | * Create an empty, zero length array. | ||
| 88 | * | ||
| 89 | * info - describes the array | ||
| 90 | * root - on success this will be filled out with the root block | ||
| 91 | */ | ||
| 92 | int dm_array_empty(struct dm_array_info *info, dm_block_t *root); | ||
| 93 | |||
| 94 | /* | ||
| 95 | * Resizes the array. | ||
| 96 | * | ||
| 97 | * info - describes the array | ||
| 98 | * root - the root block of the array on disk | ||
| 99 | * old_size - the caller is responsible for remembering the size of | ||
| 100 | * the array | ||
| 101 | * new_size - can be bigger or smaller than old_size | ||
| 102 | * value - if we're growing the array the new entries will have this value | ||
| 103 | * new_root - on success, points to the new root block | ||
| 104 | * | ||
| 105 | * If growing, the inc function for 'value' will be called the appropriate | ||
| 106 | * number of times. So if the caller is holding a reference they may want | ||
| 107 | * to drop it. | ||
| 108 | */ | ||
| 109 | int dm_array_resize(struct dm_array_info *info, dm_block_t root, | ||
| 110 | uint32_t old_size, uint32_t new_size, | ||
| 111 | const void *value, dm_block_t *new_root) | ||
| 112 | __dm_written_to_disk(value); | ||
| 113 | |||
| 114 | /* | ||
| 115 | * Frees a whole array. The value_type's decrement operation will be called | ||
| 116 | * for all values in the array. | ||
| 117 | */ | ||
| 118 | int dm_array_del(struct dm_array_info *info, dm_block_t root); | ||
| 119 | |||
| 120 | /* | ||
| 121 | * Lookup a value in the array | ||
| 122 | * | ||
| 123 | * info - describes the array | ||
| 124 | * root - root block of the array | ||
| 125 | * index - array index | ||
| 126 | * value - the value to be read. Will be in on-disk format of course. | ||
| 127 | * | ||
| 128 | * -ENODATA will be returned if the index is out of bounds. | ||
| 129 | */ | ||
| 130 | int dm_array_get_value(struct dm_array_info *info, dm_block_t root, | ||
| 131 | uint32_t index, void *value); | ||
| 132 | |||
| 133 | /* | ||
| 134 | * Set an entry in the array. | ||
| 135 | * | ||
| 136 | * info - describes the array | ||
| 137 | * root - root block of the array | ||
| 138 | * index - array index | ||
| 139 | * value - value to be written to disk. Make sure you confirm the value is | ||
| 140 | * in on-disk format with __dm_bless_for_disk() before calling. | ||
| 141 | * new_root - the new root block | ||
| 142 | * | ||
| 143 | * The old value being overwritten will be decremented, the new value | ||
| 144 | * incremented. | ||
| 145 | * | ||
| 146 | * -ENODATA will be returned if the index is out of bounds. | ||
| 147 | */ | ||
| 148 | int dm_array_set_value(struct dm_array_info *info, dm_block_t root, | ||
| 149 | uint32_t index, const void *value, dm_block_t *new_root) | ||
| 150 | __dm_written_to_disk(value); | ||
| 151 | |||
| 152 | /* | ||
| 153 | * Walk through all the entries in an array. | ||
| 154 | * | ||
| 155 | * info - describes the array | ||
| 156 | * root - root block of the array | ||
| 157 | * fn - called back for every element | ||
| 158 | * context - passed to the callback | ||
| 159 | */ | ||
| 160 | int dm_array_walk(struct dm_array_info *info, dm_block_t root, | ||
| 161 | int (*fn)(void *context, uint64_t key, void *leaf), | ||
| 162 | void *context); | ||
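For instance, a walk callback that just counts live entries might look like this (a sketch; count_fn and the unsigned counter are illustrative, not part of the interface):

static int count_fn(void *context, uint64_t key, void *leaf)
{
	unsigned *nr = context;

	(*nr)++;
	return 0;	/* returning non-zero aborts the walk */
}

	...
	unsigned nr = 0;
	r = dm_array_walk(&info, root, count_fn, &nr);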
| 163 | |||
| 164 | /*----------------------------------------------------------------*/ | ||
| 165 | |||
| 166 | #endif /* _LINUX_DM_ARRAY_H */ | ||
diff --git a/drivers/md/persistent-data/dm-bitset.c b/drivers/md/persistent-data/dm-bitset.c new file mode 100644 index 000000000000..cd9a86d4cdf0 --- /dev/null +++ b/drivers/md/persistent-data/dm-bitset.c | |||
| @@ -0,0 +1,163 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2012 Red Hat, Inc. | ||
| 3 | * | ||
| 4 | * This file is released under the GPL. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include "dm-bitset.h" | ||
| 8 | #include "dm-transaction-manager.h" | ||
| 9 | |||
| 10 | #include <linux/export.h> | ||
| 11 | #include <linux/device-mapper.h> | ||
| 12 | |||
| 13 | #define DM_MSG_PREFIX "bitset" | ||
| 14 | #define BITS_PER_ARRAY_ENTRY 64 | ||
| 15 | |||
| 16 | /*----------------------------------------------------------------*/ | ||
| 17 | |||
| 18 | static struct dm_btree_value_type bitset_bvt = { | ||
| 19 | .context = NULL, | ||
| 20 | .size = sizeof(__le64), | ||
| 21 | .inc = NULL, | ||
| 22 | .dec = NULL, | ||
| 23 | .equal = NULL, | ||
| 24 | }; | ||
| 25 | |||
| 26 | /*----------------------------------------------------------------*/ | ||
| 27 | |||
| 28 | void dm_disk_bitset_init(struct dm_transaction_manager *tm, | ||
| 29 | struct dm_disk_bitset *info) | ||
| 30 | { | ||
| 31 | dm_array_info_init(&info->array_info, tm, &bitset_bvt); | ||
| 32 | info->current_index_set = false; | ||
| 33 | } | ||
| 34 | EXPORT_SYMBOL_GPL(dm_disk_bitset_init); | ||
| 35 | |||
| 36 | int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *root) | ||
| 37 | { | ||
| 38 | return dm_array_empty(&info->array_info, root); | ||
| 39 | } | ||
| 40 | EXPORT_SYMBOL_GPL(dm_bitset_empty); | ||
| 41 | |||
| 42 | int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t root, | ||
| 43 | uint32_t old_nr_entries, uint32_t new_nr_entries, | ||
| 44 | bool default_value, dm_block_t *new_root) | ||
| 45 | { | ||
| 46 | uint32_t old_blocks = dm_div_up(old_nr_entries, BITS_PER_ARRAY_ENTRY); | ||
| 47 | uint32_t new_blocks = dm_div_up(new_nr_entries, BITS_PER_ARRAY_ENTRY); | ||
| 48 | __le64 value = default_value ? cpu_to_le64(~0) : cpu_to_le64(0); | ||
| 49 | |||
| 50 | __dm_bless_for_disk(&value); | ||
| 51 | return dm_array_resize(&info->array_info, root, old_blocks, new_blocks, | ||
| 52 | &value, new_root); | ||
| 53 | } | ||
| 54 | EXPORT_SYMBOL_GPL(dm_bitset_resize); | ||
| 55 | |||
| 56 | int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root) | ||
| 57 | { | ||
| 58 | return dm_array_del(&info->array_info, root); | ||
| 59 | } | ||
| 60 | EXPORT_SYMBOL_GPL(dm_bitset_del); | ||
| 61 | |||
| 62 | int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root, | ||
| 63 | dm_block_t *new_root) | ||
| 64 | { | ||
| 65 | int r; | ||
| 66 | __le64 value; | ||
| 67 | |||
| 68 | if (!info->current_index_set) | ||
| 69 | return 0; | ||
| 70 | |||
| 71 | value = cpu_to_le64(info->current_bits); | ||
| 72 | |||
| 73 | __dm_bless_for_disk(&value); | ||
| 74 | r = dm_array_set_value(&info->array_info, root, info->current_index, | ||
| 75 | &value, new_root); | ||
| 76 | if (r) | ||
| 77 | return r; | ||
| 78 | |||
| 79 | info->current_index_set = false; | ||
| 80 | return 0; | ||
| 81 | } | ||
| 82 | EXPORT_SYMBOL_GPL(dm_bitset_flush); | ||
| 83 | |||
| 84 | static int read_bits(struct dm_disk_bitset *info, dm_block_t root, | ||
| 85 | uint32_t array_index) | ||
| 86 | { | ||
| 87 | int r; | ||
| 88 | __le64 value; | ||
| 89 | |||
| 90 | r = dm_array_get_value(&info->array_info, root, array_index, &value); | ||
| 91 | if (r) | ||
| 92 | return r; | ||
| 93 | |||
| 94 | info->current_bits = le64_to_cpu(value); | ||
| 95 | info->current_index_set = true; | ||
| 96 | info->current_index = array_index; | ||
| 97 | return 0; | ||
| 98 | } | ||
| 99 | |||
| 100 | static int get_array_entry(struct dm_disk_bitset *info, dm_block_t root, | ||
| 101 | uint32_t index, dm_block_t *new_root) | ||
| 102 | { | ||
| 103 | int r; | ||
| 104 | unsigned array_index = index / BITS_PER_ARRAY_ENTRY; | ||
| 105 | |||
| 106 | if (info->current_index_set) { | ||
| 107 | if (info->current_index == array_index) | ||
| 108 | return 0; | ||
| 109 | |||
| 110 | r = dm_bitset_flush(info, root, new_root); | ||
| 111 | if (r) | ||
| 112 | return r; | ||
| 113 | } | ||
| 114 | |||
| 115 | return read_bits(info, root, array_index); | ||
| 116 | } | ||
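The index arithmetic in the accessors below decomposes a bit index into an array entry plus a bit offset. An illustrative example, with an arbitrarily chosen index:

/*
 * With 64 bits per array entry, bit index 130 maps to:
 *	array_index = 130 / BITS_PER_ARRAY_ENTRY = 2
 *	b           = 130 % BITS_PER_ARRAY_ENTRY = 2
 * ie. bit 2 of the third __le64 word, which is the word the one-entry
 * cache will hold after the access.
 */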
| 117 | |||
| 118 | int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root, | ||
| 119 | uint32_t index, dm_block_t *new_root) | ||
| 120 | { | ||
| 121 | int r; | ||
| 122 | unsigned b = index % BITS_PER_ARRAY_ENTRY; | ||
| 123 | |||
| 124 | r = get_array_entry(info, root, index, new_root); | ||
| 125 | if (r) | ||
| 126 | return r; | ||
| 127 | |||
| 128 | set_bit(b, (unsigned long *) &info->current_bits); | ||
| 129 | return 0; | ||
| 130 | } | ||
| 131 | EXPORT_SYMBOL_GPL(dm_bitset_set_bit); | ||
| 132 | |||
| 133 | int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root, | ||
| 134 | uint32_t index, dm_block_t *new_root) | ||
| 135 | { | ||
| 136 | int r; | ||
| 137 | unsigned b = index % BITS_PER_ARRAY_ENTRY; | ||
| 138 | |||
| 139 | r = get_array_entry(info, root, index, new_root); | ||
| 140 | if (r) | ||
| 141 | return r; | ||
| 142 | |||
| 143 | clear_bit(b, (unsigned long *) &info->current_bits); | ||
| 144 | return 0; | ||
| 145 | } | ||
| 146 | EXPORT_SYMBOL_GPL(dm_bitset_clear_bit); | ||
| 147 | |||
| 148 | int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root, | ||
| 149 | uint32_t index, dm_block_t *new_root, bool *result) | ||
| 150 | { | ||
| 151 | int r; | ||
| 152 | unsigned b = index % BITS_PER_ARRAY_ENTRY; | ||
| 153 | |||
| 154 | r = get_array_entry(info, root, index, new_root); | ||
| 155 | if (r) | ||
| 156 | return r; | ||
| 157 | |||
| 158 | *result = test_bit(b, (unsigned long *) &info->current_bits); | ||
| 159 | return 0; | ||
| 160 | } | ||
| 161 | EXPORT_SYMBOL_GPL(dm_bitset_test_bit); | ||
| 162 | |||
| 163 | /*----------------------------------------------------------------*/ | ||
diff --git a/drivers/md/persistent-data/dm-bitset.h b/drivers/md/persistent-data/dm-bitset.h new file mode 100644 index 000000000000..e1b9bea14aa1 --- /dev/null +++ b/drivers/md/persistent-data/dm-bitset.h | |||
| @@ -0,0 +1,165 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2012 Red Hat, Inc. | ||
| 3 | * | ||
| 4 | * This file is released under the GPL. | ||
| 5 | */ | ||
| 6 | #ifndef _LINUX_DM_BITSET_H | ||
| 7 | #define _LINUX_DM_BITSET_H | ||
| 8 | |||
| 9 | #include "dm-array.h" | ||
| 10 | |||
| 11 | /*----------------------------------------------------------------*/ | ||
| 12 | |||
| 13 | /* | ||
| 14 | * This bitset type is a thin wrapper round a dm_array of 64bit words. It | ||
| 15 | * uses a tiny, one word cache to reduce the number of array lookups and so | ||
| 16 | * increase performance. | ||
| 17 | * | ||
| 18 | * Like the dm-array that it's based on, the caller needs to keep track of | ||
| 19 | * the size of the bitset separately. The underlying dm-array implicitly | ||
| 20 | * knows how many words it's storing and will return -ENODATA if you try | ||
| 21 | * to access an out-of-bounds word. However, an out-of-bounds bit in the | ||
| 22 | * final word will _not_ be detected, you have been warned. | ||
| 23 | * | ||
| 24 | * Bits are indexed from zero. | ||
| 25 | * | ||
| 26 | * Typical use: | ||
| 27 | * | ||
| 28 | * a) Initialise a dm_disk_bitset structure with dm_disk_bitset_init(). | ||
| 29 | * This describes the bitset and includes the cache. It's not called | ||
| 30 | * dm_bitset_info, in line with the other data structures, because it | ||
| 31 | * does include instance data. | ||
| 32 | * | ||
| 33 | * b) Get yourself a root. The root is the index of a block of data on the | ||
| 34 | * disk that holds a particular instance of a bitset. You may have a | ||
| 35 | * pre-existing root in your metadata that you wish to use, or you may | ||
| 36 | * want to create a brand new, empty bitset with dm_bitset_empty(). | ||
| 37 | * | ||
| 38 | * Like the other data structures in this library, dm_bitset objects are | ||
| 39 | * immutable between transactions. Update functions will return you the | ||
| 40 | * root for a _new_ bitset. If you've incremented the old root, via | ||
| 41 | * dm_tm_inc(), before calling the update function you may continue to use | ||
| 42 | * it in parallel with the new root. | ||
| 43 | * | ||
| 44 | * Even read operations may trigger the cache to be flushed and as such | ||
| 45 | * return a root for a new, updated bitset. | ||
| 46 | * | ||
| 47 | * c) resize a bitset with dm_bitset_resize(). | ||
| 48 | * | ||
| 49 | * d) Set a bit with dm_bitset_set_bit(). | ||
| 50 | * | ||
| 51 | * e) Clear a bit with dm_bitset_clear_bit(). | ||
| 52 | * | ||
| 53 | * f) Test a bit with dm_bitset_test_bit(). | ||
| 54 | * | ||
| 55 | * g) Flush all updates from the cache with dm_bitset_flush(). | ||
| 56 | * | ||
| 57 | * h) Destroy the bitset with dm_bitset_del(). This tells the transaction | ||
| 58 | * manager that you're no longer using this data structure so it can | ||
| 59 | * recycle its blocks. (dm_bitset_dec() would be a better name for it, | ||
| 60 | * but del is in keeping with dm_btree_del()). | ||
| 61 | */ | ||
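A hedged sketch of steps (a)-(h); the 1024-bit size and the bit index are arbitrary, 'tm' is an assumed pre-existing transaction manager, and error handling is abbreviated:

static int example_bitset_use(struct dm_transaction_manager *tm)
{
	int r;
	bool set;
	dm_block_t root;
	struct dm_disk_bitset bits;

	dm_disk_bitset_init(tm, &bits);				/* step a */

	r = dm_bitset_empty(&bits, &root);			/* step b */
	if (r)
		return r;

	r = dm_bitset_resize(&bits, root, 0, 1024, false, &root); /* step c */
	if (r)
		return r;

	r = dm_bitset_set_bit(&bits, root, 17, &root);		/* step d */
	if (r)
		return r;

	r = dm_bitset_test_bit(&bits, root, 17, &root, &set);	/* step f */
	if (r)
		return r;

	r = dm_bitset_flush(&bits, root, &root);		/* step g */
	if (r)
		return r;

	return dm_bitset_del(&bits, root);			/* step h */
}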
| 62 | |||
| 63 | /* | ||
| 64 | * Opaque object. Unlike dm_array_info, you should have one of these per | ||
| 65 | * bitset. Initialise with dm_disk_bitset_init(). | ||
| 66 | */ | ||
| 67 | struct dm_disk_bitset { | ||
| 68 | struct dm_array_info array_info; | ||
| 69 | |||
| 70 | uint32_t current_index; | ||
| 71 | uint64_t current_bits; | ||
| 72 | |||
| 73 | bool current_index_set:1; | ||
| 74 | }; | ||
| 75 | |||
| 76 | /* | ||
| 77 | * Sets up a dm_disk_bitset structure. You don't need to do anything with | ||
| 78 | * this structure when you finish using it. | ||
| 79 | * | ||
| 80 | * tm - the transaction manager that should supervise this structure | ||
| 81 | * info - the structure being initialised | ||
| 82 | */ | ||
| 83 | void dm_disk_bitset_init(struct dm_transaction_manager *tm, | ||
| 84 | struct dm_disk_bitset *info); | ||
| 85 | |||
| 86 | /* | ||
| 87 | * Create an empty, zero length bitset. | ||
| 88 | * | ||
| 89 | * info - describes the bitset | ||
| 90 | * new_root - on success, points to the new root block | ||
| 91 | */ | ||
| 92 | int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *new_root); | ||
| 93 | |||
| 94 | /* | ||
| 95 | * Resize the bitset. | ||
| 96 | * | ||
| 97 | * info - describes the bitset | ||
| 98 | * old_root - the root block of the array on disk | ||
| 99 | * old_nr_entries - the number of bits in the old bitset | ||
| 100 | * new_nr_entries - the number of bits you want in the new bitset | ||
| 101 | * default_value - the value for any new bits | ||
| 102 | * new_root - on success, points to the new root block | ||
| 103 | */ | ||
| 104 | int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t old_root, | ||
| 105 | uint32_t old_nr_entries, uint32_t new_nr_entries, | ||
| 106 | bool default_value, dm_block_t *new_root); | ||
| 107 | |||
| 108 | /* | ||
| 109 | * Frees the bitset. | ||
| 110 | */ | ||
| 111 | int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root); | ||
| 112 | |||
| 113 | /* | ||
| 114 | * Set a bit. | ||
| 115 | * | ||
| 116 | * info - describes the bitset | ||
| 117 | * root - the root block of the bitset | ||
| 118 | * index - the bit index | ||
| 119 | * new_root - on success, points to the new root block | ||
| 120 | * | ||
| 121 | * -ENODATA will be returned if the index is out of bounds. | ||
| 122 | */ | ||
| 123 | int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root, | ||
| 124 | uint32_t index, dm_block_t *new_root); | ||
| 125 | |||
| 126 | /* | ||
| 127 | * Clears a bit. | ||
| 128 | * | ||
| 129 | * info - describes the bitset | ||
| 130 | * root - the root block of the bitset | ||
| 131 | * index - the bit index | ||
| 132 | * new_root - on success, points to the new root block | ||
| 133 | * | ||
| 134 | * -ENODATA will be returned if the index is out of bounds. | ||
| 135 | */ | ||
| 136 | int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root, | ||
| 137 | uint32_t index, dm_block_t *new_root); | ||
| 138 | |||
| 139 | /* | ||
| 140 | * Tests a bit. | ||
| 141 | * | ||
| 142 | * info - describes the bitset | ||
| 143 | * root - the root block of the bitset | ||
| 144 | * index - the bit index | ||
| 145 | * new_root - on success, points to the new root block (cached values may have been written) | ||
| 146 | * result - the bit value you're after | ||
| 147 | * | ||
| 148 | * -ENODATA will be returned if the index is out of bounds. | ||
| 149 | */ | ||
| 150 | int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root, | ||
| 151 | uint32_t index, dm_block_t *new_root, bool *result); | ||
| 152 | |||
| 153 | /* | ||
| 154 | * Flush any cached changes to disk. | ||
| 155 | * | ||
| 156 | * info - describes the bitset | ||
| 157 | * root - the root block of the bitset | ||
| 158 | * new_root - on success, points to the new root block | ||
| 159 | */ | ||
| 160 | int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root, | ||
| 161 | dm_block_t *new_root); | ||
| 162 | |||
| 163 | /*----------------------------------------------------------------*/ | ||
| 164 | |||
| 165 | #endif /* _LINUX_DM_BITSET_H */ | ||
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 28c3ed072a79..81b513890e2b 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c | |||
| @@ -613,6 +613,7 @@ int dm_bm_flush_and_unlock(struct dm_block_manager *bm, | |||
| 613 | 613 | ||
| 614 | return dm_bufio_write_dirty_buffers(bm->bufio); | 614 | return dm_bufio_write_dirty_buffers(bm->bufio); |
| 615 | } | 615 | } |
| 616 | EXPORT_SYMBOL_GPL(dm_bm_flush_and_unlock); | ||
| 616 | 617 | ||
| 617 | void dm_bm_set_read_only(struct dm_block_manager *bm) | 618 | void dm_bm_set_read_only(struct dm_block_manager *bm) |
| 618 | { | 619 | { |
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index accbb05f17b6..37d367bb9aa8 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h | |||
| @@ -64,6 +64,7 @@ struct ro_spine { | |||
| 64 | void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info); | 64 | void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info); |
| 65 | int exit_ro_spine(struct ro_spine *s); | 65 | int exit_ro_spine(struct ro_spine *s); |
| 66 | int ro_step(struct ro_spine *s, dm_block_t new_child); | 66 | int ro_step(struct ro_spine *s, dm_block_t new_child); |
| 67 | void ro_pop(struct ro_spine *s); | ||
| 67 | struct btree_node *ro_node(struct ro_spine *s); | 68 | struct btree_node *ro_node(struct ro_spine *s); |
| 68 | 69 | ||
| 69 | struct shadow_spine { | 70 | struct shadow_spine { |
diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c index f199a0c4ed04..cf9fd676ae44 100644 --- a/drivers/md/persistent-data/dm-btree-spine.c +++ b/drivers/md/persistent-data/dm-btree-spine.c | |||
| @@ -164,6 +164,13 @@ int ro_step(struct ro_spine *s, dm_block_t new_child) | |||
| 164 | return r; | 164 | return r; |
| 165 | } | 165 | } |
| 166 | 166 | ||
| 167 | void ro_pop(struct ro_spine *s) | ||
| 168 | { | ||
| 169 | BUG_ON(!s->count); | ||
| 170 | --s->count; | ||
| 171 | unlock_block(s->info, s->nodes[s->count]); | ||
| 172 | } | ||
| 173 | |||
| 167 | struct btree_node *ro_node(struct ro_spine *s) | 174 | struct btree_node *ro_node(struct ro_spine *s) |
| 168 | { | 175 | { |
| 169 | struct dm_block *block; | 176 | struct dm_block *block; |
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 4caf66918cdb..35865425e4b4 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c | |||
| @@ -807,3 +807,55 @@ int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, | |||
| 807 | return r ? r : count; | 807 | return r ? r : count; |
| 808 | } | 808 | } |
| 809 | EXPORT_SYMBOL_GPL(dm_btree_find_highest_key); | 809 | EXPORT_SYMBOL_GPL(dm_btree_find_highest_key); |
| 810 | |||
| 811 | /* | ||
| 812 | * FIXME: We shouldn't use a recursive algorithm when we have limited stack | ||
| 813 | * space. Also this only works for single level trees. | ||
| 814 | */ | ||
| 815 | static int walk_node(struct ro_spine *s, dm_block_t block, | ||
| 816 | int (*fn)(void *context, uint64_t *keys, void *leaf), | ||
| 817 | void *context) | ||
| 818 | { | ||
| 819 | int r; | ||
| 820 | unsigned i, nr; | ||
| 821 | struct btree_node *n; | ||
| 822 | uint64_t keys; | ||
| 823 | |||
| 824 | r = ro_step(s, block); | ||
| 825 | if (r) | ||
| 826 | return r; | ||
| 827 | n = ro_node(s); | ||
| 828 | |||
| 829 | nr = le32_to_cpu(n->header.nr_entries); | ||
| 830 | for (i = 0; i < nr; i++) { | ||
| 831 | if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) { | ||
| 832 | r = walk_node(s, value64(n, i), fn, context); | ||
| 833 | if (r) | ||
| 834 | goto out; | ||
| 835 | } else { | ||
| 836 | keys = le64_to_cpu(*key_ptr(n, i)); | ||
| 837 | r = fn(context, &keys, value_ptr(n, i)); | ||
| 838 | if (r) | ||
| 839 | goto out; | ||
| 840 | } | ||
| 841 | } | ||
| 842 | |||
| 843 | out: | ||
| 844 | ro_pop(s); | ||
| 845 | return r; | ||
| 846 | } | ||
| 847 | |||
| 848 | int dm_btree_walk(struct dm_btree_info *info, dm_block_t root, | ||
| 849 | int (*fn)(void *context, uint64_t *keys, void *leaf), | ||
| 850 | void *context) | ||
| 851 | { | ||
| 852 | int r; | ||
| 853 | struct ro_spine spine; | ||
| 854 | |||
| 855 | BUG_ON(info->levels > 1); | ||
| 856 | |||
| 857 | init_ro_spine(&spine, info); | ||
| 858 | r = walk_node(&spine, root, fn, context); | ||
| 859 | exit_ro_spine(&spine); | ||
| 860 | |||
| 861 | return r; | ||
| 862 | } | ||
| 863 | EXPORT_SYMBOL_GPL(dm_btree_walk); | ||
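dm_btree_walk() visits every entry of a single-level btree, invoking the callback once per entry and stopping early if it returns non-zero. A hypothetical caller sketch (count_entry() and count_entries() are invented names; the walker's real users arrive elsewhere in this series):

static int count_entry(void *context, uint64_t *keys, void *leaf)
{
        unsigned *count = context;

        (*count)++;     /* keys[0] is this entry's key, leaf points at its value */
        return 0;       /* non-zero would abort the walk */
}

static int count_entries(struct dm_btree_info *info, dm_block_t root,
                         unsigned *count)
{
        *count = 0;
        return dm_btree_walk(info, root, count_entry, count);
}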
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h index a2cd50441ca1..8672d159e0b5 100644 --- a/drivers/md/persistent-data/dm-btree.h +++ b/drivers/md/persistent-data/dm-btree.h | |||
| @@ -58,21 +58,21 @@ struct dm_btree_value_type { | |||
| 58 | * somewhere.) This method is _not_ called for insertion of a new | 58 | * somewhere.) This method is _not_ called for insertion of a new |
| 59 | * value: It is assumed the ref count is already 1. | 59 | * value: It is assumed the ref count is already 1. |
| 60 | */ | 60 | */ |
| 61 | void (*inc)(void *context, void *value); | 61 | void (*inc)(void *context, const void *value); |
| 62 | 62 | ||
| 63 | /* | 63 | /* |
| 64 | * This value is being deleted. The btree takes care of freeing | 64 | * This value is being deleted. The btree takes care of freeing |
| 65 | * the memory pointed to by @value. Often the del function just | 65 | * the memory pointed to by @value. Often the del function just |
| 66 | * needs to decrement a reference count somewhere. | 66 | * needs to decrement a reference count somewhere. |
| 67 | */ | 67 | */ |
| 68 | void (*dec)(void *context, void *value); | 68 | void (*dec)(void *context, const void *value); |
| 69 | 69 | ||
| 70 | /* | 70 | /* |
| 71 | * A test for equality between two values. When a value is | 71 | * A test for equality between two values. When a value is |
| 72 | * overwritten with a new one, the old one has the dec method | 72 | * overwritten with a new one, the old one has the dec method |
| 73 | * called _unless_ the new and old value are deemed equal. | 73 | * called _unless_ the new and old value are deemed equal. |
| 74 | */ | 74 | */ |
| 75 | int (*equal)(void *context, void *value1, void *value2); | 75 | int (*equal)(void *context, const void *value1, const void *value2); |
| 76 | }; | 76 | }; |
| 77 | 77 | ||
| 78 | /* | 78 | /* |
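With the const qualifiers added above, a value type's callbacks must treat the stored value as read-only, and should not assume alignment, since values are packed inside btree nodes. A conforming sketch (ref_inc(), ref_equal() and inc_refcount() are invented names):

static void ref_inc(void *context, const void *value)
{
        __le64 v;

        memcpy(&v, value, sizeof(v));           /* copy out of packed storage */
        inc_refcount(context, le64_to_cpu(v));  /* hypothetical helper */
}

static int ref_equal(void *context, const void *v1, const void *v2)
{
        return !memcmp(v1, v2, sizeof(__le64));
}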
| @@ -142,4 +142,13 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, | |||
| 142 | int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, | 142 | int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, |
| 143 | uint64_t *result_keys); | 143 | uint64_t *result_keys); |
| 144 | 144 | ||
| 145 | /* | ||
| 146 | * Iterate through a btree, calling fn() on each entry. | ||
| 147 | * It only works for single level trees and is internally recursive, so | ||
| 148 | * monitor stack usage carefully. | ||
| 149 | */ | ||
| 150 | int dm_btree_walk(struct dm_btree_info *info, dm_block_t root, | ||
| 151 | int (*fn)(void *context, uint64_t *keys, void *leaf), | ||
| 152 | void *context); | ||
| 153 | |||
| 145 | #endif /* _LINUX_DM_BTREE_H */ | 154 | #endif /* _LINUX_DM_BTREE_H */ |
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index d247a35da3c6..81da1a26042e 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c | |||
| @@ -25,8 +25,8 @@ struct shadow_info { | |||
| 25 | /* | 25 | /* |
| 26 | * It would be nice if we scaled with the size of transaction. | 26 | * It would be nice if we scaled with the size of transaction. |
| 27 | */ | 27 | */ |
| 28 | #define HASH_SIZE 256 | 28 | #define DM_HASH_SIZE 256 |
| 29 | #define HASH_MASK (HASH_SIZE - 1) | 29 | #define DM_HASH_MASK (DM_HASH_SIZE - 1) |
| 30 | 30 | ||
| 31 | struct dm_transaction_manager { | 31 | struct dm_transaction_manager { |
| 32 | int is_clone; | 32 | int is_clone; |
| @@ -36,7 +36,7 @@ struct dm_transaction_manager { | |||
| 36 | struct dm_space_map *sm; | 36 | struct dm_space_map *sm; |
| 37 | 37 | ||
| 38 | spinlock_t lock; | 38 | spinlock_t lock; |
| 39 | struct hlist_head buckets[HASH_SIZE]; | 39 | struct hlist_head buckets[DM_HASH_SIZE]; |
| 40 | }; | 40 | }; |
| 41 | 41 | ||
| 42 | /*----------------------------------------------------------------*/ | 42 | /*----------------------------------------------------------------*/ |
| @@ -44,12 +44,11 @@ struct dm_transaction_manager { | |||
| 44 | static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b) | 44 | static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b) |
| 45 | { | 45 | { |
| 46 | int r = 0; | 46 | int r = 0; |
| 47 | unsigned bucket = dm_hash_block(b, HASH_MASK); | 47 | unsigned bucket = dm_hash_block(b, DM_HASH_MASK); |
| 48 | struct shadow_info *si; | 48 | struct shadow_info *si; |
| 49 | struct hlist_node *n; | ||
| 50 | 49 | ||
| 51 | spin_lock(&tm->lock); | 50 | spin_lock(&tm->lock); |
| 52 | hlist_for_each_entry(si, n, tm->buckets + bucket, hlist) | 51 | hlist_for_each_entry(si, tm->buckets + bucket, hlist) |
| 53 | if (si->where == b) { | 52 | if (si->where == b) { |
| 54 | r = 1; | 53 | r = 1; |
| 55 | break; | 54 | break; |
| @@ -71,7 +70,7 @@ static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b) | |||
| 71 | si = kmalloc(sizeof(*si), GFP_NOIO); | 70 | si = kmalloc(sizeof(*si), GFP_NOIO); |
| 72 | if (si) { | 71 | if (si) { |
| 73 | si->where = b; | 72 | si->where = b; |
| 74 | bucket = dm_hash_block(b, HASH_MASK); | 73 | bucket = dm_hash_block(b, DM_HASH_MASK); |
| 75 | spin_lock(&tm->lock); | 74 | spin_lock(&tm->lock); |
| 76 | hlist_add_head(&si->hlist, tm->buckets + bucket); | 75 | hlist_add_head(&si->hlist, tm->buckets + bucket); |
| 77 | spin_unlock(&tm->lock); | 76 | spin_unlock(&tm->lock); |
| @@ -81,14 +80,14 @@ static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b) | |||
| 81 | static void wipe_shadow_table(struct dm_transaction_manager *tm) | 80 | static void wipe_shadow_table(struct dm_transaction_manager *tm) |
| 82 | { | 81 | { |
| 83 | struct shadow_info *si; | 82 | struct shadow_info *si; |
| 84 | struct hlist_node *n, *tmp; | 83 | struct hlist_node *tmp; |
| 85 | struct hlist_head *bucket; | 84 | struct hlist_head *bucket; |
| 86 | int i; | 85 | int i; |
| 87 | 86 | ||
| 88 | spin_lock(&tm->lock); | 87 | spin_lock(&tm->lock); |
| 89 | for (i = 0; i < HASH_SIZE; i++) { | 88 | for (i = 0; i < DM_HASH_SIZE; i++) { |
| 90 | bucket = tm->buckets + i; | 89 | bucket = tm->buckets + i; |
| 91 | hlist_for_each_entry_safe(si, n, tmp, bucket, hlist) | 90 | hlist_for_each_entry_safe(si, tmp, bucket, hlist) |
| 92 | kfree(si); | 91 | kfree(si); |
| 93 | 92 | ||
| 94 | INIT_HLIST_HEAD(bucket); | 93 | INIT_HLIST_HEAD(bucket); |
| @@ -115,7 +114,7 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm, | |||
| 115 | tm->sm = sm; | 114 | tm->sm = sm; |
| 116 | 115 | ||
| 117 | spin_lock_init(&tm->lock); | 116 | spin_lock_init(&tm->lock); |
| 118 | for (i = 0; i < HASH_SIZE; i++) | 117 | for (i = 0; i < DM_HASH_SIZE; i++) |
| 119 | INIT_HLIST_HEAD(tm->buckets + i); | 118 | INIT_HLIST_HEAD(tm->buckets + i); |
| 120 | 119 | ||
| 121 | return tm; | 120 | return tm; |
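Two mechanical changes run through this file: the hash constants gain a DM_ prefix, presumably to avoid colliding with the generic HASH_SIZE() macro from <linux/hashtable.h>, and the hlist iterators lose their separate node cursor (a kernel-wide API change in this release). The new iterator forms, sketched against this file's types (the bucket variable stands in for tm->buckets + i):

struct shadow_info *si;
struct hlist_node *tmp;

hlist_for_each_entry(si, bucket, hlist)            /* was (si, n, bucket, hlist) */
        pr_debug("shadow at %llu\n", (unsigned long long)si->where);

hlist_for_each_entry_safe(si, tmp, bucket, hlist)  /* safe form keeps a cursor */
        kfree(si);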
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 24b359717a7e..0505452de8d6 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
| @@ -175,7 +175,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) | |||
| 175 | rdev1->new_raid_disk = j; | 175 | rdev1->new_raid_disk = j; |
| 176 | } | 176 | } |
| 177 | 177 | ||
| 178 | if (j < 0 || j >= mddev->raid_disks) { | 178 | if (j < 0) { |
| 179 | printk(KERN_ERR | ||
| 180 | "md/raid0:%s: remove inactive devices before converting to RAID0\n", | ||
| 181 | mdname(mddev)); | ||
| 182 | goto abort; | ||
| 183 | } | ||
| 184 | if (j >= mddev->raid_disks) { | ||
| 179 | printk(KERN_ERR "md/raid0:%s: bad disk number %d - " | 185 | printk(KERN_ERR "md/raid0:%s: bad disk number %d - " |
| 180 | "aborting!\n", mdname(mddev), j); | 186 | "aborting!\n", mdname(mddev), j); |
| 181 | goto abort; | 187 | goto abort; |
| @@ -289,7 +295,7 @@ abort: | |||
| 289 | kfree(conf->strip_zone); | 295 | kfree(conf->strip_zone); |
| 290 | kfree(conf->devlist); | 296 | kfree(conf->devlist); |
| 291 | kfree(conf); | 297 | kfree(conf); |
| 292 | *private_conf = NULL; | 298 | *private_conf = ERR_PTR(err); |
| 293 | return err; | 299 | return err; |
| 294 | } | 300 | } |
| 295 | 301 | ||
| @@ -411,7 +417,8 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks | |||
| 411 | "%s does not support generic reshape\n", __func__); | 417 | "%s does not support generic reshape\n", __func__); |
| 412 | 418 | ||
| 413 | rdev_for_each(rdev, mddev) | 419 | rdev_for_each(rdev, mddev) |
| 414 | array_sectors += rdev->sectors; | 420 | array_sectors += (rdev->sectors & |
| 421 | ~(sector_t)(mddev->chunk_sectors-1)); | ||
| 415 | 422 | ||
| 416 | return array_sectors; | 423 | return array_sectors; |
| 417 | } | 424 | } |
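raid0_size() previously summed raw member sizes; the change above first rounds each device down to a whole number of chunks, since a tail too small for a full chunk can never be used. chunk_sectors is a power of two, so the mask arithmetic is exact; an illustrative helper (not part of the patch):

static sector_t chunk_aligned_sectors(sector_t dev_sectors,
                                      unsigned int chunk_sectors)
{
        /* Clear the low bits: round down to a multiple of chunk_sectors. */
        return dev_sectors & ~(sector_t)(chunk_sectors - 1);
}

/* e.g. chunk_aligned_sectors(1000, 128) == 896; the trailing 104 sectors
 * cannot hold a full 128-sector chunk and are excluded from the array. */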
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d5bddfc4010e..fd86b372692d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
| @@ -967,6 +967,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) | |||
| 967 | bio_list_merge(&conf->pending_bio_list, &plug->pending); | 967 | bio_list_merge(&conf->pending_bio_list, &plug->pending); |
| 968 | conf->pending_count += plug->pending_cnt; | 968 | conf->pending_count += plug->pending_cnt; |
| 969 | spin_unlock_irq(&conf->device_lock); | 969 | spin_unlock_irq(&conf->device_lock); |
| 970 | wake_up(&conf->wait_barrier); | ||
| 970 | md_wakeup_thread(mddev->thread); | 971 | md_wakeup_thread(mddev->thread); |
| 971 | kfree(plug); | 972 | kfree(plug); |
| 972 | return; | 973 | return; |
| @@ -1000,6 +1001,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
| 1000 | const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); | 1001 | const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); |
| 1001 | const unsigned long do_discard = (bio->bi_rw | 1002 | const unsigned long do_discard = (bio->bi_rw |
| 1002 | & (REQ_DISCARD | REQ_SECURE)); | 1003 | & (REQ_DISCARD | REQ_SECURE)); |
| 1004 | const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME); | ||
| 1003 | struct md_rdev *blocked_rdev; | 1005 | struct md_rdev *blocked_rdev; |
| 1004 | struct blk_plug_cb *cb; | 1006 | struct blk_plug_cb *cb; |
| 1005 | struct raid1_plug_cb *plug = NULL; | 1007 | struct raid1_plug_cb *plug = NULL; |
| @@ -1301,7 +1303,8 @@ read_again: | |||
| 1301 | conf->mirrors[i].rdev->data_offset); | 1303 | conf->mirrors[i].rdev->data_offset); |
| 1302 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; | 1304 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; |
| 1303 | mbio->bi_end_io = raid1_end_write_request; | 1305 | mbio->bi_end_io = raid1_end_write_request; |
| 1304 | mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard; | 1306 | mbio->bi_rw = |
| 1307 | WRITE | do_flush_fua | do_sync | do_discard | do_same; | ||
| 1305 | mbio->bi_private = r1_bio; | 1308 | mbio->bi_private = r1_bio; |
| 1306 | 1309 | ||
| 1307 | atomic_inc(&r1_bio->remaining); | 1310 | atomic_inc(&r1_bio->remaining); |
| @@ -2818,6 +2821,9 @@ static int run(struct mddev *mddev) | |||
| 2818 | if (IS_ERR(conf)) | 2821 | if (IS_ERR(conf)) |
| 2819 | return PTR_ERR(conf); | 2822 | return PTR_ERR(conf); |
| 2820 | 2823 | ||
| 2824 | if (mddev->queue) | ||
| 2825 | blk_queue_max_write_same_sectors(mddev->queue, | ||
| 2826 | mddev->chunk_sectors); | ||
| 2821 | rdev_for_each(rdev, mddev) { | 2827 | rdev_for_each(rdev, mddev) { |
| 2822 | if (!mddev->gendisk) | 2828 | if (!mddev->gendisk) |
| 2823 | continue; | 2829 | continue; |
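With REQ_WRITE_SAME now forwarded to every mirror and a write-same limit advertised on the queue, upper layers can reach this path through the stock block-layer helper; a sketch assuming the caller already holds a suitable payload page:

static int fill_range(struct block_device *bdev, sector_t sector,
                      sector_t nr_sects, struct page *page)
{
        /* Replicates 'page' across the range; raid1 passes the resulting
         * REQ_WRITE_SAME bios through to each mirror. */
        return blkdev_issue_write_same(bdev, sector, nr_sects,
                                       GFP_KERNEL, page);
}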
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 64d48249c03b..77b562d18a90 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
| @@ -38,21 +38,36 @@ | |||
| 38 | * near_copies (stored in low byte of layout) | 38 | * near_copies (stored in low byte of layout) |
| 39 | * far_copies (stored in second byte of layout) | 39 | * far_copies (stored in second byte of layout) |
| 40 | * far_offset (stored in bit 16 of layout ) | 40 | * far_offset (stored in bit 16 of layout ) |
| 41 | * use_far_sets (stored in bit 17 of layout ) | ||
| 41 | * | 42 | * |
| 42 | * The data to be stored is divided into chunks using chunksize. | 43 | * The data to be stored is divided into chunks using chunksize. Each device |
| 43 | * Each device is divided into far_copies sections. | 44 | * is divided into far_copies sections. In each section, chunks are laid out |
| 44 | * In each section, chunks are laid out in a style similar to raid0, but | 45 | * in a style similar to raid0, but near_copies copies of each chunk is stored |
| 45 | * near_copies copies of each chunk is stored (each on a different drive). | 46 | * (each on a different drive). The starting device for each section is offset |
| 46 | * The starting device for each section is offset near_copies from the starting | 47 | * near_copies from the starting device of the previous section. Thus there |
| 47 | * device of the previous section. | 48 | * are (near_copies * far_copies) of each chunk, and each is on a different |
| 48 | * Thus they are (near_copies*far_copies) of each chunk, and each is on a different | 49 | * drive. near_copies and far_copies must be at least one, and their product |
| 49 | * drive. | 50 | * is at most raid_disks. |
| 50 | * near_copies and far_copies must be at least one, and their product is at most | ||
| 51 | * raid_disks. | ||
| 52 | * | 51 | * |
| 53 | * If far_offset is true, then the far_copies are handled a bit differently. | 52 | * If far_offset is true, then the far_copies are handled a bit differently. |
| 54 | * The copies are still in different stripes, but instead of be very far apart | 53 | * The copies are still in different stripes, but instead of being very far |
| 55 | * on disk, there are adjacent stripes. | 54 | * apart on disk, they are in adjacent stripes. |
| 55 | * | ||
| 56 | * The far and offset algorithms are handled slightly differently if | ||
| 57 | * 'use_far_sets' is true. In this case, the array's devices are grouped into | ||
| 58 | * sets that are (near_copies * far_copies) in size. The far copied stripes | ||
| 59 | * are still shifted by 'near_copies' devices, but this shifting stays confined | ||
| 60 | * to the set rather than the entire array. This is done to improve the number | ||
| 61 | * of device combinations that can fail without causing the array to fail. | ||
| 62 | * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk | ||
| 63 | * on a device): | ||
| 64 | * A B C D A B C D E | ||
| 65 | * ... ... | ||
| 66 | * D A B C E A B C D | ||
| 67 | * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s): | ||
| 68 | * [A B] [C D] [A B] [C D E] | ||
| 69 | * |...| |...| |...| | ... | | ||
| 70 | * [B A] [D C] [B A] [E C D] | ||
| 56 | */ | 71 | */ |
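A condensed sketch of the set-confined shift described above, covering regular sets only (the __raid10_find_phys() hunk below also special-cases an irregular final set when raid_disks is not divisible by the set size):

static int next_far_dev(int d, int near_copies, int far_set_size)
{
        int set = d / far_set_size;     /* which set this device belongs to */

        d += near_copies;               /* the classic 'far' shift...       */
        d %= far_set_size;              /* ...wrapped within the set        */
        return d + set * far_set_size;  /* back to array-wide numbering     */
}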
| 57 | 72 | ||
| 58 | /* | 73 | /* |
| @@ -535,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) | |||
| 535 | sector_t stripe; | 550 | sector_t stripe; |
| 536 | int dev; | 551 | int dev; |
| 537 | int slot = 0; | 552 | int slot = 0; |
| 553 | int last_far_set_start, last_far_set_size; | ||
| 554 | |||
| 555 | last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; | ||
| 556 | last_far_set_start *= geo->far_set_size; | ||
| 557 | |||
| 558 | last_far_set_size = geo->far_set_size; | ||
| 559 | last_far_set_size += (geo->raid_disks % geo->far_set_size); | ||
| 538 | 560 | ||
| 539 | /* now calculate first sector/dev */ | 561 | /* now calculate first sector/dev */ |
| 540 | chunk = r10bio->sector >> geo->chunk_shift; | 562 | chunk = r10bio->sector >> geo->chunk_shift; |
| @@ -551,15 +573,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) | |||
| 551 | /* and calculate all the others */ | 573 | /* and calculate all the others */ |
| 552 | for (n = 0; n < geo->near_copies; n++) { | 574 | for (n = 0; n < geo->near_copies; n++) { |
| 553 | int d = dev; | 575 | int d = dev; |
| 576 | int set; | ||
| 554 | sector_t s = sector; | 577 | sector_t s = sector; |
| 555 | r10bio->devs[slot].addr = sector; | ||
| 556 | r10bio->devs[slot].devnum = d; | 578 | r10bio->devs[slot].devnum = d; |
| 579 | r10bio->devs[slot].addr = s; | ||
| 557 | slot++; | 580 | slot++; |
| 558 | 581 | ||
| 559 | for (f = 1; f < geo->far_copies; f++) { | 582 | for (f = 1; f < geo->far_copies; f++) { |
| 583 | set = d / geo->far_set_size; | ||
| 560 | d += geo->near_copies; | 584 | d += geo->near_copies; |
| 561 | if (d >= geo->raid_disks) | 585 | |
| 562 | d -= geo->raid_disks; | 586 | if ((geo->raid_disks % geo->far_set_size) && |
| 587 | (d > last_far_set_start)) { | ||
| 588 | d -= last_far_set_start; | ||
| 589 | d %= last_far_set_size; | ||
| 590 | d += last_far_set_start; | ||
| 591 | } else { | ||
| 592 | d %= geo->far_set_size; | ||
| 593 | d += geo->far_set_size * set; | ||
| 594 | } | ||
| 563 | s += geo->stride; | 595 | s += geo->stride; |
| 564 | r10bio->devs[slot].devnum = d; | 596 | r10bio->devs[slot].devnum = d; |
| 565 | r10bio->devs[slot].addr = s; | 597 | r10bio->devs[slot].addr = s; |
| @@ -595,6 +627,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) | |||
| 595 | * or recovery, so reshape isn't happening | 627 | * or recovery, so reshape isn't happening |
| 596 | */ | 628 | */ |
| 597 | struct geom *geo = &conf->geo; | 629 | struct geom *geo = &conf->geo; |
| 630 | int far_set_start = (dev / geo->far_set_size) * geo->far_set_size; | ||
| 631 | int far_set_size = geo->far_set_size; | ||
| 632 | int last_far_set_start; | ||
| 633 | |||
| 634 | if (geo->raid_disks % geo->far_set_size) { | ||
| 635 | last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; | ||
| 636 | last_far_set_start *= geo->far_set_size; | ||
| 637 | |||
| 638 | if (dev >= last_far_set_start) { | ||
| 639 | far_set_size = geo->far_set_size; | ||
| 640 | far_set_size += (geo->raid_disks % geo->far_set_size); | ||
| 641 | far_set_start = last_far_set_start; | ||
| 642 | } | ||
| 643 | } | ||
| 598 | 644 | ||
| 599 | offset = sector & geo->chunk_mask; | 645 | offset = sector & geo->chunk_mask; |
| 600 | if (geo->far_offset) { | 646 | if (geo->far_offset) { |
| @@ -602,13 +648,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) | |||
| 602 | chunk = sector >> geo->chunk_shift; | 648 | chunk = sector >> geo->chunk_shift; |
| 603 | fc = sector_div(chunk, geo->far_copies); | 649 | fc = sector_div(chunk, geo->far_copies); |
| 604 | dev -= fc * geo->near_copies; | 650 | dev -= fc * geo->near_copies; |
| 605 | if (dev < 0) | 651 | if (dev < far_set_start) |
| 606 | dev += geo->raid_disks; | 652 | dev += far_set_size; |
| 607 | } else { | 653 | } else { |
| 608 | while (sector >= geo->stride) { | 654 | while (sector >= geo->stride) { |
| 609 | sector -= geo->stride; | 655 | sector -= geo->stride; |
| 610 | if (dev < geo->near_copies) | 656 | if (dev < (geo->near_copies + far_set_start)) |
| 611 | dev += geo->raid_disks - geo->near_copies; | 657 | dev += far_set_size - geo->near_copies; |
| 612 | else | 658 | else |
| 613 | dev -= geo->near_copies; | 659 | dev -= geo->near_copies; |
| 614 | } | 660 | } |
| @@ -1073,6 +1119,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) | |||
| 1073 | bio_list_merge(&conf->pending_bio_list, &plug->pending); | 1119 | bio_list_merge(&conf->pending_bio_list, &plug->pending); |
| 1074 | conf->pending_count += plug->pending_cnt; | 1120 | conf->pending_count += plug->pending_cnt; |
| 1075 | spin_unlock_irq(&conf->device_lock); | 1121 | spin_unlock_irq(&conf->device_lock); |
| 1122 | wake_up(&conf->wait_barrier); | ||
| 1076 | md_wakeup_thread(mddev->thread); | 1123 | md_wakeup_thread(mddev->thread); |
| 1077 | kfree(plug); | 1124 | kfree(plug); |
| 1078 | return; | 1125 | return; |
| @@ -1105,6 +1152,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
| 1105 | const unsigned long do_fua = (bio->bi_rw & REQ_FUA); | 1152 | const unsigned long do_fua = (bio->bi_rw & REQ_FUA); |
| 1106 | const unsigned long do_discard = (bio->bi_rw | 1153 | const unsigned long do_discard = (bio->bi_rw |
| 1107 | & (REQ_DISCARD | REQ_SECURE)); | 1154 | & (REQ_DISCARD | REQ_SECURE)); |
| 1155 | const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME); | ||
| 1108 | unsigned long flags; | 1156 | unsigned long flags; |
| 1109 | struct md_rdev *blocked_rdev; | 1157 | struct md_rdev *blocked_rdev; |
| 1110 | struct blk_plug_cb *cb; | 1158 | struct blk_plug_cb *cb; |
| @@ -1460,7 +1508,8 @@ retry_write: | |||
| 1460 | rdev)); | 1508 | rdev)); |
| 1461 | mbio->bi_bdev = rdev->bdev; | 1509 | mbio->bi_bdev = rdev->bdev; |
| 1462 | mbio->bi_end_io = raid10_end_write_request; | 1510 | mbio->bi_end_io = raid10_end_write_request; |
| 1463 | mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; | 1511 | mbio->bi_rw = |
| 1512 | WRITE | do_sync | do_fua | do_discard | do_same; | ||
| 1464 | mbio->bi_private = r10_bio; | 1513 | mbio->bi_private = r10_bio; |
| 1465 | 1514 | ||
| 1466 | atomic_inc(&r10_bio->remaining); | 1515 | atomic_inc(&r10_bio->remaining); |
| @@ -1502,7 +1551,8 @@ retry_write: | |||
| 1502 | r10_bio, rdev)); | 1551 | r10_bio, rdev)); |
| 1503 | mbio->bi_bdev = rdev->bdev; | 1552 | mbio->bi_bdev = rdev->bdev; |
| 1504 | mbio->bi_end_io = raid10_end_write_request; | 1553 | mbio->bi_end_io = raid10_end_write_request; |
| 1505 | mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; | 1554 | mbio->bi_rw = |
| 1555 | WRITE | do_sync | do_fua | do_discard | do_same; | ||
| 1506 | mbio->bi_private = r10_bio; | 1556 | mbio->bi_private = r10_bio; |
| 1507 | 1557 | ||
| 1508 | atomic_inc(&r10_bio->remaining); | 1558 | atomic_inc(&r10_bio->remaining); |
| @@ -3436,7 +3486,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) | |||
| 3436 | disks = mddev->raid_disks + mddev->delta_disks; | 3486 | disks = mddev->raid_disks + mddev->delta_disks; |
| 3437 | break; | 3487 | break; |
| 3438 | } | 3488 | } |
| 3439 | if (layout >> 17) | 3489 | if (layout >> 18) |
| 3440 | return -1; | 3490 | return -1; |
| 3441 | if (chunk < (PAGE_SIZE >> 9) || | 3491 | if (chunk < (PAGE_SIZE >> 9) || |
| 3442 | !is_power_of_2(chunk)) | 3492 | !is_power_of_2(chunk)) |
| @@ -3448,6 +3498,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) | |||
| 3448 | geo->near_copies = nc; | 3498 | geo->near_copies = nc; |
| 3449 | geo->far_copies = fc; | 3499 | geo->far_copies = fc; |
| 3450 | geo->far_offset = fo; | 3500 | geo->far_offset = fo; |
| 3501 | geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks; | ||
| 3451 | geo->chunk_mask = chunk - 1; | 3502 | geo->chunk_mask = chunk - 1; |
| 3452 | geo->chunk_shift = ffz(~chunk); | 3503 | geo->chunk_shift = ffz(~chunk); |
| 3453 | return nc*fc; | 3504 | return nc*fc; |
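For reference, a sketch (not part of the patch) mirroring how setup_geo() unpacks the layout word, with field positions as documented at the top of raid10.c; the stricter 'layout >> 18' check above now rejects any bits beyond use_far_sets:

static void decode_layout(int layout)
{
        int nc  = layout & 0xff;         /* near_copies: low byte   */
        int fc  = (layout >> 8) & 0xff;  /* far_copies: second byte */
        bool fo = layout & (1 << 16);    /* far_offset: bit 16      */
        bool fs = layout & (1 << 17);    /* use_far_sets: bit 17    */

        pr_info("nc=%d fc=%d fo=%d far_sets=%d\n", nc, fc, fo, fs);
}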
| @@ -3569,6 +3620,8 @@ static int run(struct mddev *mddev) | |||
| 3569 | if (mddev->queue) { | 3620 | if (mddev->queue) { |
| 3570 | blk_queue_max_discard_sectors(mddev->queue, | 3621 | blk_queue_max_discard_sectors(mddev->queue, |
| 3571 | mddev->chunk_sectors); | 3622 | mddev->chunk_sectors); |
| 3623 | blk_queue_max_write_same_sectors(mddev->queue, | ||
| 3624 | mddev->chunk_sectors); | ||
| 3572 | blk_queue_io_min(mddev->queue, chunk_size); | 3625 | blk_queue_io_min(mddev->queue, chunk_size); |
| 3573 | if (conf->geo.raid_disks % conf->geo.near_copies) | 3626 | if (conf->geo.raid_disks % conf->geo.near_copies) |
| 3574 | blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); | 3627 | blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); |
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 1054cf602345..157d69e83ff4 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h | |||
| @@ -33,6 +33,11 @@ struct r10conf { | |||
| 33 | * far_offset, in which case it is | 33 | * far_offset, in which case it is |
| 34 | * 1 stripe. | 34 | * 1 stripe. |
| 35 | */ | 35 | */ |
| 36 | int far_set_size; /* The number of devices in a set, | ||
| 37 | * where a 'set' is a group of | ||
| 38 | * devices that contain far/offset | ||
| 39 | * copies of each other. | ||
| 40 | */ | ||
| 36 | int chunk_shift; /* shift from chunks to sectors */ | 41 | int chunk_shift; /* shift from chunks to sectors */ |
| 37 | sector_t chunk_mask; | 42 | sector_t chunk_mask; |
| 38 | } prev, geo; | 43 | } prev, geo; |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 19d77a026639..3ee2912889e7 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
| @@ -184,8 +184,6 @@ static void return_io(struct bio *return_bi) | |||
| 184 | return_bi = bi->bi_next; | 184 | return_bi = bi->bi_next; |
| 185 | bi->bi_next = NULL; | 185 | bi->bi_next = NULL; |
| 186 | bi->bi_size = 0; | 186 | bi->bi_size = 0; |
| 187 | trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), | ||
| 188 | bi, 0); | ||
| 189 | bio_endio(bi, 0); | 187 | bio_endio(bi, 0); |
| 190 | bi = return_bi; | 188 | bi = return_bi; |
| 191 | } | 189 | } |
| @@ -365,10 +363,9 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, | |||
| 365 | short generation) | 363 | short generation) |
| 366 | { | 364 | { |
| 367 | struct stripe_head *sh; | 365 | struct stripe_head *sh; |
| 368 | struct hlist_node *hn; | ||
| 369 | 366 | ||
| 370 | pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); | 367 | pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); |
| 371 | hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) | 368 | hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) |
| 372 | if (sh->sector == sector && sh->generation == generation) | 369 | if (sh->sector == sector && sh->generation == generation) |
| 373 | return sh; | 370 | return sh; |
| 374 | pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); | 371 | pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); |
| @@ -1406,7 +1403,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu | |||
| 1406 | &sh->ops.zero_sum_result, percpu->spare_page, &submit); | 1403 | &sh->ops.zero_sum_result, percpu->spare_page, &submit); |
| 1407 | } | 1404 | } |
| 1408 | 1405 | ||
| 1409 | static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | 1406 | static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) |
| 1410 | { | 1407 | { |
| 1411 | int overlap_clear = 0, i, disks = sh->disks; | 1408 | int overlap_clear = 0, i, disks = sh->disks; |
| 1412 | struct dma_async_tx_descriptor *tx = NULL; | 1409 | struct dma_async_tx_descriptor *tx = NULL; |
| @@ -1471,36 +1468,6 @@ static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
| 1471 | put_cpu(); | 1468 | put_cpu(); |
| 1472 | } | 1469 | } |
| 1473 | 1470 | ||
| 1474 | #ifdef CONFIG_MULTICORE_RAID456 | ||
| 1475 | static void async_run_ops(void *param, async_cookie_t cookie) | ||
| 1476 | { | ||
| 1477 | struct stripe_head *sh = param; | ||
| 1478 | unsigned long ops_request = sh->ops.request; | ||
| 1479 | |||
| 1480 | clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state); | ||
| 1481 | wake_up(&sh->ops.wait_for_ops); | ||
| 1482 | |||
| 1483 | __raid_run_ops(sh, ops_request); | ||
| 1484 | release_stripe(sh); | ||
| 1485 | } | ||
| 1486 | |||
| 1487 | static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | ||
| 1488 | { | ||
| 1489 | /* since handle_stripe can be called outside of raid5d context | ||
| 1490 | * we need to ensure sh->ops.request is de-staged before another | ||
| 1491 | * request arrives | ||
| 1492 | */ | ||
| 1493 | wait_event(sh->ops.wait_for_ops, | ||
| 1494 | !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state)); | ||
| 1495 | sh->ops.request = ops_request; | ||
| 1496 | |||
| 1497 | atomic_inc(&sh->count); | ||
| 1498 | async_schedule(async_run_ops, sh); | ||
| 1499 | } | ||
| 1500 | #else | ||
| 1501 | #define raid_run_ops __raid_run_ops | ||
| 1502 | #endif | ||
| 1503 | |||
| 1504 | static int grow_one_stripe(struct r5conf *conf) | 1471 | static int grow_one_stripe(struct r5conf *conf) |
| 1505 | { | 1472 | { |
| 1506 | struct stripe_head *sh; | 1473 | struct stripe_head *sh; |
| @@ -1509,9 +1476,6 @@ static int grow_one_stripe(struct r5conf *conf) | |||
| 1509 | return 0; | 1476 | return 0; |
| 1510 | 1477 | ||
| 1511 | sh->raid_conf = conf; | 1478 | sh->raid_conf = conf; |
| 1512 | #ifdef CONFIG_MULTICORE_RAID456 | ||
| 1513 | init_waitqueue_head(&sh->ops.wait_for_ops); | ||
| 1514 | #endif | ||
| 1515 | 1479 | ||
| 1516 | spin_lock_init(&sh->stripe_lock); | 1480 | spin_lock_init(&sh->stripe_lock); |
| 1517 | 1481 | ||
| @@ -1630,9 +1594,6 @@ static int resize_stripes(struct r5conf *conf, int newsize) | |||
| 1630 | break; | 1594 | break; |
| 1631 | 1595 | ||
| 1632 | nsh->raid_conf = conf; | 1596 | nsh->raid_conf = conf; |
| 1633 | #ifdef CONFIG_MULTICORE_RAID456 | ||
| 1634 | init_waitqueue_head(&nsh->ops.wait_for_ops); | ||
| 1635 | #endif | ||
| 1636 | spin_lock_init(&nsh->stripe_lock); | 1597 | spin_lock_init(&nsh->stripe_lock); |
| 1637 | 1598 | ||
| 1638 | list_add(&nsh->lru, &newstripes); | 1599 | list_add(&nsh->lru, &newstripes); |
| @@ -3917,8 +3878,6 @@ static void raid5_align_endio(struct bio *bi, int error) | |||
| 3917 | rdev_dec_pending(rdev, conf->mddev); | 3878 | rdev_dec_pending(rdev, conf->mddev); |
| 3918 | 3879 | ||
| 3919 | if (!error && uptodate) { | 3880 | if (!error && uptodate) { |
| 3920 | trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), | ||
| 3921 | raid_bi, 0); | ||
| 3922 | bio_endio(raid_bi, 0); | 3881 | bio_endio(raid_bi, 0); |
| 3923 | if (atomic_dec_and_test(&conf->active_aligned_reads)) | 3882 | if (atomic_dec_and_test(&conf->active_aligned_reads)) |
| 3924 | wake_up(&conf->wait_for_stripe); | 3883 | wake_up(&conf->wait_for_stripe); |
| @@ -4377,8 +4336,6 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
| 4377 | if ( rw == WRITE ) | 4336 | if ( rw == WRITE ) |
| 4378 | md_write_end(mddev); | 4337 | md_write_end(mddev); |
| 4379 | 4338 | ||
| 4380 | trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), | ||
| 4381 | bi, 0); | ||
| 4382 | bio_endio(bi, 0); | 4339 | bio_endio(bi, 0); |
| 4383 | } | 4340 | } |
| 4384 | } | 4341 | } |
| @@ -4755,11 +4712,8 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) | |||
| 4755 | handled++; | 4712 | handled++; |
| 4756 | } | 4713 | } |
| 4757 | remaining = raid5_dec_bi_active_stripes(raid_bio); | 4714 | remaining = raid5_dec_bi_active_stripes(raid_bio); |
| 4758 | if (remaining == 0) { | 4715 | if (remaining == 0) |
| 4759 | trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), | ||
| 4760 | raid_bio, 0); | ||
| 4761 | bio_endio(raid_bio, 0); | 4716 | bio_endio(raid_bio, 0); |
| 4762 | } | ||
| 4763 | if (atomic_dec_and_test(&conf->active_aligned_reads)) | 4717 | if (atomic_dec_and_test(&conf->active_aligned_reads)) |
| 4764 | wake_up(&conf->wait_for_stripe); | 4718 | wake_up(&conf->wait_for_stripe); |
| 4765 | return handled; | 4719 | return handled; |
