diff options
Diffstat (limited to 'drivers/md')
| -rw-r--r-- | drivers/md/Kconfig | 22 | ||||
| -rw-r--r-- | drivers/md/dm-cache-metadata.c | 104 | ||||
| -rw-r--r-- | drivers/md/dm-cache-metadata.h | 5 | ||||
| -rw-r--r-- | drivers/md/dm-cache-policy-internal.h | 7 | ||||
| -rw-r--r-- | drivers/md/dm-cache-policy-mq.c | 681 | ||||
| -rw-r--r-- | drivers/md/dm-cache-policy.c | 4 | ||||
| -rw-r--r-- | drivers/md/dm-cache-policy.h | 21 | ||||
| -rw-r--r-- | drivers/md/dm-cache-target.c | 687 | ||||
| -rw-r--r-- | drivers/md/dm-crypt.c | 214 | ||||
| -rw-r--r-- | drivers/md/dm-ioctl.c | 36 | ||||
| -rw-r--r-- | drivers/md/dm-mpath.c | 34 | ||||
| -rw-r--r-- | drivers/md/dm-table.c | 23 | ||||
| -rw-r--r-- | drivers/md/dm.c | 47 | ||||
| -rw-r--r-- | drivers/md/dm.h | 13 | ||||
| -rw-r--r-- | drivers/md/persistent-data/dm-array.c | 5 | ||||
| -rw-r--r-- | drivers/md/persistent-data/dm-space-map-disk.c | 18 |
16 files changed, 1466 insertions, 455 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 30b426ed744b..f2ccbc3b9fe4 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig | |||
| @@ -297,6 +297,17 @@ config DM_MIRROR | |||
| 297 | Allow volume managers to mirror logical volumes, also | 297 | Allow volume managers to mirror logical volumes, also |
| 298 | needed for live data migration tools such as 'pvmove'. | 298 | needed for live data migration tools such as 'pvmove'. |
| 299 | 299 | ||
| 300 | config DM_LOG_USERSPACE | ||
| 301 | tristate "Mirror userspace logging" | ||
| 302 | depends on DM_MIRROR && NET | ||
| 303 | select CONNECTOR | ||
| 304 | ---help--- | ||
| 305 | The userspace logging module provides a mechanism for | ||
| 306 | relaying the dm-dirty-log API to userspace. Log designs | ||
| 307 | which are more suited to userspace implementation (e.g. | ||
| 308 | shared storage logs) or experimental logs can be implemented | ||
| 309 | by leveraging this framework. | ||
| 310 | |||
| 300 | config DM_RAID | 311 | config DM_RAID |
| 301 | tristate "RAID 1/4/5/6/10 target" | 312 | tristate "RAID 1/4/5/6/10 target" |
| 302 | depends on BLK_DEV_DM | 313 | depends on BLK_DEV_DM |
| @@ -323,17 +334,6 @@ config DM_RAID | |||
| 323 | RAID-5, RAID-6 distributes the syndromes across the drives | 334 | RAID-5, RAID-6 distributes the syndromes across the drives |
| 324 | in one of the available parity distribution methods. | 335 | in one of the available parity distribution methods. |
| 325 | 336 | ||
| 326 | config DM_LOG_USERSPACE | ||
| 327 | tristate "Mirror userspace logging" | ||
| 328 | depends on DM_MIRROR && NET | ||
| 329 | select CONNECTOR | ||
| 330 | ---help--- | ||
| 331 | The userspace logging module provides a mechanism for | ||
| 332 | relaying the dm-dirty-log API to userspace. Log designs | ||
| 333 | which are more suited to userspace implementation (e.g. | ||
| 334 | shared storage logs) or experimental logs can be implemented | ||
| 335 | by leveraging this framework. | ||
| 336 | |||
| 337 | config DM_ZERO | 337 | config DM_ZERO |
| 338 | tristate "Zero target" | 338 | tristate "Zero target" |
| 339 | depends on BLK_DEV_DM | 339 | depends on BLK_DEV_DM |
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 1af7255bbffb..9ef0752e8a08 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c | |||
| @@ -20,7 +20,13 @@ | |||
| 20 | 20 | ||
| 21 | #define CACHE_SUPERBLOCK_MAGIC 06142003 | 21 | #define CACHE_SUPERBLOCK_MAGIC 06142003 |
| 22 | #define CACHE_SUPERBLOCK_LOCATION 0 | 22 | #define CACHE_SUPERBLOCK_LOCATION 0 |
| 23 | #define CACHE_VERSION 1 | 23 | |
| 24 | /* | ||
| 25 | * Defines a range of metadata versions that this module can handle. | ||
| 26 | */ | ||
| 27 | #define MIN_CACHE_VERSION 1 | ||
| 28 | #define MAX_CACHE_VERSION 1 | ||
| 29 | |||
| 24 | #define CACHE_METADATA_CACHE_SIZE 64 | 30 | #define CACHE_METADATA_CACHE_SIZE 64 |
| 25 | 31 | ||
| 26 | /* | 32 | /* |
| @@ -134,6 +140,18 @@ static void sb_prepare_for_write(struct dm_block_validator *v, | |||
| 134 | SUPERBLOCK_CSUM_XOR)); | 140 | SUPERBLOCK_CSUM_XOR)); |
| 135 | } | 141 | } |
| 136 | 142 | ||
| 143 | static int check_metadata_version(struct cache_disk_superblock *disk_super) | ||
| 144 | { | ||
| 145 | uint32_t metadata_version = le32_to_cpu(disk_super->version); | ||
| 146 | if (metadata_version < MIN_CACHE_VERSION || metadata_version > MAX_CACHE_VERSION) { | ||
| 147 | DMERR("Cache metadata version %u found, but only versions between %u and %u supported.", | ||
| 148 | metadata_version, MIN_CACHE_VERSION, MAX_CACHE_VERSION); | ||
| 149 | return -EINVAL; | ||
| 150 | } | ||
| 151 | |||
| 152 | return 0; | ||
| 153 | } | ||
| 154 | |||
| 137 | static int sb_check(struct dm_block_validator *v, | 155 | static int sb_check(struct dm_block_validator *v, |
| 138 | struct dm_block *b, | 156 | struct dm_block *b, |
| 139 | size_t sb_block_size) | 157 | size_t sb_block_size) |
| @@ -164,7 +182,7 @@ static int sb_check(struct dm_block_validator *v, | |||
| 164 | return -EILSEQ; | 182 | return -EILSEQ; |
| 165 | } | 183 | } |
| 166 | 184 | ||
| 167 | return 0; | 185 | return check_metadata_version(disk_super); |
| 168 | } | 186 | } |
| 169 | 187 | ||
| 170 | static struct dm_block_validator sb_validator = { | 188 | static struct dm_block_validator sb_validator = { |
| @@ -198,7 +216,7 @@ static int superblock_lock(struct dm_cache_metadata *cmd, | |||
| 198 | 216 | ||
| 199 | /*----------------------------------------------------------------*/ | 217 | /*----------------------------------------------------------------*/ |
| 200 | 218 | ||
| 201 | static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result) | 219 | static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *result) |
| 202 | { | 220 | { |
| 203 | int r; | 221 | int r; |
| 204 | unsigned i; | 222 | unsigned i; |
| @@ -214,10 +232,10 @@ static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result) | |||
| 214 | return r; | 232 | return r; |
| 215 | 233 | ||
| 216 | data_le = dm_block_data(b); | 234 | data_le = dm_block_data(b); |
| 217 | *result = 1; | 235 | *result = true; |
| 218 | for (i = 0; i < sb_block_size; i++) { | 236 | for (i = 0; i < sb_block_size; i++) { |
| 219 | if (data_le[i] != zero) { | 237 | if (data_le[i] != zero) { |
| 220 | *result = 0; | 238 | *result = false; |
| 221 | break; | 239 | break; |
| 222 | } | 240 | } |
| 223 | } | 241 | } |
| @@ -270,7 +288,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) | |||
| 270 | disk_super->flags = 0; | 288 | disk_super->flags = 0; |
| 271 | memset(disk_super->uuid, 0, sizeof(disk_super->uuid)); | 289 | memset(disk_super->uuid, 0, sizeof(disk_super->uuid)); |
| 272 | disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC); | 290 | disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC); |
| 273 | disk_super->version = cpu_to_le32(CACHE_VERSION); | 291 | disk_super->version = cpu_to_le32(MAX_CACHE_VERSION); |
| 274 | memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); | 292 | memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); |
| 275 | memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version)); | 293 | memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version)); |
| 276 | disk_super->policy_hint_size = 0; | 294 | disk_super->policy_hint_size = 0; |
| @@ -411,7 +429,8 @@ bad: | |||
| 411 | static int __open_or_format_metadata(struct dm_cache_metadata *cmd, | 429 | static int __open_or_format_metadata(struct dm_cache_metadata *cmd, |
| 412 | bool format_device) | 430 | bool format_device) |
| 413 | { | 431 | { |
| 414 | int r, unformatted; | 432 | int r; |
| 433 | bool unformatted = false; | ||
| 415 | 434 | ||
| 416 | r = __superblock_all_zeroes(cmd->bm, &unformatted); | 435 | r = __superblock_all_zeroes(cmd->bm, &unformatted); |
| 417 | if (r) | 436 | if (r) |
| @@ -666,19 +685,85 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd) | |||
| 666 | kfree(cmd); | 685 | kfree(cmd); |
| 667 | } | 686 | } |
| 668 | 687 | ||
| 688 | /* | ||
| 689 | * Checks that the given cache block is either unmapped or clean. | ||
| 690 | */ | ||
| 691 | static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b, | ||
| 692 | bool *result) | ||
| 693 | { | ||
| 694 | int r; | ||
| 695 | __le64 value; | ||
| 696 | dm_oblock_t ob; | ||
| 697 | unsigned flags; | ||
| 698 | |||
| 699 | r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(b), &value); | ||
| 700 | if (r) { | ||
| 701 | DMERR("block_unmapped_or_clean failed"); | ||
| 702 | return r; | ||
| 703 | } | ||
| 704 | |||
| 705 | unpack_value(value, &ob, &flags); | ||
| 706 | *result = !((flags & M_VALID) && (flags & M_DIRTY)); | ||
| 707 | |||
| 708 | return 0; | ||
| 709 | } | ||
| 710 | |||
| 711 | static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd, | ||
| 712 | dm_cblock_t begin, dm_cblock_t end, | ||
| 713 | bool *result) | ||
| 714 | { | ||
| 715 | int r; | ||
| 716 | *result = true; | ||
| 717 | |||
| 718 | while (begin != end) { | ||
| 719 | r = block_unmapped_or_clean(cmd, begin, result); | ||
| 720 | if (r) | ||
| 721 | return r; | ||
| 722 | |||
| 723 | if (!*result) { | ||
| 724 | DMERR("cache block %llu is dirty", | ||
| 725 | (unsigned long long) from_cblock(begin)); | ||
| 726 | return 0; | ||
| 727 | } | ||
| 728 | |||
| 729 | begin = to_cblock(from_cblock(begin) + 1); | ||
| 730 | } | ||
| 731 | |||
| 732 | return 0; | ||
| 733 | } | ||
| 734 | |||
| 669 | int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size) | 735 | int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size) |
| 670 | { | 736 | { |
| 671 | int r; | 737 | int r; |
| 738 | bool clean; | ||
| 672 | __le64 null_mapping = pack_value(0, 0); | 739 | __le64 null_mapping = pack_value(0, 0); |
| 673 | 740 | ||
| 674 | down_write(&cmd->root_lock); | 741 | down_write(&cmd->root_lock); |
| 675 | __dm_bless_for_disk(&null_mapping); | 742 | __dm_bless_for_disk(&null_mapping); |
| 743 | |||
| 744 | if (from_cblock(new_cache_size) < from_cblock(cmd->cache_blocks)) { | ||
| 745 | r = blocks_are_unmapped_or_clean(cmd, new_cache_size, cmd->cache_blocks, &clean); | ||
| 746 | if (r) { | ||
| 747 | __dm_unbless_for_disk(&null_mapping); | ||
| 748 | goto out; | ||
| 749 | } | ||
| 750 | |||
| 751 | if (!clean) { | ||
| 752 | DMERR("unable to shrink cache due to dirty blocks"); | ||
| 753 | r = -EINVAL; | ||
| 754 | __dm_unbless_for_disk(&null_mapping); | ||
| 755 | goto out; | ||
| 756 | } | ||
| 757 | } | ||
| 758 | |||
| 676 | r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks), | 759 | r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks), |
| 677 | from_cblock(new_cache_size), | 760 | from_cblock(new_cache_size), |
| 678 | &null_mapping, &cmd->root); | 761 | &null_mapping, &cmd->root); |
| 679 | if (!r) | 762 | if (!r) |
| 680 | cmd->cache_blocks = new_cache_size; | 763 | cmd->cache_blocks = new_cache_size; |
| 681 | cmd->changed = true; | 764 | cmd->changed = true; |
| 765 | |||
| 766 | out: | ||
| 682 | up_write(&cmd->root_lock); | 767 | up_write(&cmd->root_lock); |
| 683 | 768 | ||
| 684 | return r; | 769 | return r; |
| @@ -1182,3 +1267,8 @@ int dm_cache_save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock, | |||
| 1182 | 1267 | ||
| 1183 | return r; | 1268 | return r; |
| 1184 | } | 1269 | } |
| 1270 | |||
| 1271 | int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result) | ||
| 1272 | { | ||
| 1273 | return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result); | ||
| 1274 | } | ||
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index f45cef21f3d0..cd906f14f98d 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h | |||
| @@ -137,6 +137,11 @@ int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy * | |||
| 137 | int dm_cache_save_hint(struct dm_cache_metadata *cmd, | 137 | int dm_cache_save_hint(struct dm_cache_metadata *cmd, |
| 138 | dm_cblock_t cblock, uint32_t hint); | 138 | dm_cblock_t cblock, uint32_t hint); |
| 139 | 139 | ||
| 140 | /* | ||
| 141 | * Query method. Are all the blocks in the cache clean? | ||
| 142 | */ | ||
| 143 | int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result); | ||
| 144 | |||
| 140 | /*----------------------------------------------------------------*/ | 145 | /*----------------------------------------------------------------*/ |
| 141 | 146 | ||
| 142 | #endif /* DM_CACHE_METADATA_H */ | 147 | #endif /* DM_CACHE_METADATA_H */ |
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h index 0928abdc49f0..2256a1f24f73 100644 --- a/drivers/md/dm-cache-policy-internal.h +++ b/drivers/md/dm-cache-policy-internal.h | |||
| @@ -61,7 +61,12 @@ static inline int policy_writeback_work(struct dm_cache_policy *p, | |||
| 61 | 61 | ||
| 62 | static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) | 62 | static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) |
| 63 | { | 63 | { |
| 64 | return p->remove_mapping(p, oblock); | 64 | p->remove_mapping(p, oblock); |
| 65 | } | ||
| 66 | |||
| 67 | static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) | ||
| 68 | { | ||
| 69 | return p->remove_cblock(p, cblock); | ||
| 65 | } | 70 | } |
| 66 | 71 | ||
| 67 | static inline void policy_force_mapping(struct dm_cache_policy *p, | 72 | static inline void policy_force_mapping(struct dm_cache_policy *p, |
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 4296155090b2..416b7b752a6e 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c | |||
| @@ -26,19 +26,6 @@ static unsigned next_power(unsigned n, unsigned min) | |||
| 26 | 26 | ||
| 27 | /*----------------------------------------------------------------*/ | 27 | /*----------------------------------------------------------------*/ |
| 28 | 28 | ||
| 29 | static unsigned long *alloc_bitset(unsigned nr_entries) | ||
| 30 | { | ||
| 31 | size_t s = sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG); | ||
| 32 | return vzalloc(s); | ||
| 33 | } | ||
| 34 | |||
| 35 | static void free_bitset(unsigned long *bits) | ||
| 36 | { | ||
| 37 | vfree(bits); | ||
| 38 | } | ||
| 39 | |||
| 40 | /*----------------------------------------------------------------*/ | ||
| 41 | |||
| 42 | /* | 29 | /* |
| 43 | * Large, sequential ios are probably better left on the origin device since | 30 | * Large, sequential ios are probably better left on the origin device since |
| 44 | * spindles tend to have good bandwidth. | 31 | * spindles tend to have good bandwidth. |
| @@ -151,6 +138,21 @@ static void queue_init(struct queue *q) | |||
| 151 | } | 138 | } |
| 152 | 139 | ||
| 153 | /* | 140 | /* |
| 141 | * Checks to see if the queue is empty. | ||
| 142 | * FIXME: reduce cpu usage. | ||
| 143 | */ | ||
| 144 | static bool queue_empty(struct queue *q) | ||
| 145 | { | ||
| 146 | unsigned i; | ||
| 147 | |||
| 148 | for (i = 0; i < NR_QUEUE_LEVELS; i++) | ||
| 149 | if (!list_empty(q->qs + i)) | ||
| 150 | return false; | ||
| 151 | |||
| 152 | return true; | ||
| 153 | } | ||
| 154 | |||
| 155 | /* | ||
| 154 | * Insert an entry to the back of the given level. | 156 | * Insert an entry to the back of the given level. |
| 155 | */ | 157 | */ |
| 156 | static void queue_push(struct queue *q, unsigned level, struct list_head *elt) | 158 | static void queue_push(struct queue *q, unsigned level, struct list_head *elt) |
| @@ -218,17 +220,116 @@ struct entry { | |||
| 218 | struct hlist_node hlist; | 220 | struct hlist_node hlist; |
| 219 | struct list_head list; | 221 | struct list_head list; |
| 220 | dm_oblock_t oblock; | 222 | dm_oblock_t oblock; |
| 221 | dm_cblock_t cblock; /* valid iff in_cache */ | ||
| 222 | 223 | ||
| 223 | /* | 224 | /* |
| 224 | * FIXME: pack these better | 225 | * FIXME: pack these better |
| 225 | */ | 226 | */ |
| 226 | bool in_cache:1; | 227 | bool dirty:1; |
| 227 | unsigned hit_count; | 228 | unsigned hit_count; |
| 228 | unsigned generation; | 229 | unsigned generation; |
| 229 | unsigned tick; | 230 | unsigned tick; |
| 230 | }; | 231 | }; |
| 231 | 232 | ||
| 233 | /* | ||
| 234 | * Rather than storing the cblock in an entry, we allocate all entries in | ||
| 235 | * an array, and infer the cblock from the entry position. | ||
| 236 | * | ||
| 237 | * Free entries are linked together into a list. | ||
| 238 | */ | ||
| 239 | struct entry_pool { | ||
| 240 | struct entry *entries, *entries_end; | ||
| 241 | struct list_head free; | ||
| 242 | unsigned nr_allocated; | ||
| 243 | }; | ||
| 244 | |||
| 245 | static int epool_init(struct entry_pool *ep, unsigned nr_entries) | ||
| 246 | { | ||
| 247 | unsigned i; | ||
| 248 | |||
| 249 | ep->entries = vzalloc(sizeof(struct entry) * nr_entries); | ||
| 250 | if (!ep->entries) | ||
| 251 | return -ENOMEM; | ||
| 252 | |||
| 253 | ep->entries_end = ep->entries + nr_entries; | ||
| 254 | |||
| 255 | INIT_LIST_HEAD(&ep->free); | ||
| 256 | for (i = 0; i < nr_entries; i++) | ||
| 257 | list_add(&ep->entries[i].list, &ep->free); | ||
| 258 | |||
| 259 | ep->nr_allocated = 0; | ||
| 260 | |||
| 261 | return 0; | ||
| 262 | } | ||
| 263 | |||
| 264 | static void epool_exit(struct entry_pool *ep) | ||
| 265 | { | ||
| 266 | vfree(ep->entries); | ||
| 267 | } | ||
| 268 | |||
| 269 | static struct entry *alloc_entry(struct entry_pool *ep) | ||
| 270 | { | ||
| 271 | struct entry *e; | ||
| 272 | |||
| 273 | if (list_empty(&ep->free)) | ||
| 274 | return NULL; | ||
| 275 | |||
| 276 | e = list_entry(list_pop(&ep->free), struct entry, list); | ||
| 277 | INIT_LIST_HEAD(&e->list); | ||
| 278 | INIT_HLIST_NODE(&e->hlist); | ||
| 279 | ep->nr_allocated++; | ||
| 280 | |||
| 281 | return e; | ||
| 282 | } | ||
| 283 | |||
| 284 | /* | ||
| 285 | * This assumes the cblock hasn't already been allocated. | ||
| 286 | */ | ||
| 287 | static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock) | ||
| 288 | { | ||
| 289 | struct entry *e = ep->entries + from_cblock(cblock); | ||
| 290 | list_del(&e->list); | ||
| 291 | |||
| 292 | INIT_LIST_HEAD(&e->list); | ||
| 293 | INIT_HLIST_NODE(&e->hlist); | ||
| 294 | ep->nr_allocated++; | ||
| 295 | |||
| 296 | return e; | ||
| 297 | } | ||
| 298 | |||
| 299 | static void free_entry(struct entry_pool *ep, struct entry *e) | ||
| 300 | { | ||
| 301 | BUG_ON(!ep->nr_allocated); | ||
| 302 | ep->nr_allocated--; | ||
| 303 | INIT_HLIST_NODE(&e->hlist); | ||
| 304 | list_add(&e->list, &ep->free); | ||
| 305 | } | ||
| 306 | |||
| 307 | /* | ||
| 308 | * Returns NULL if the entry is free. | ||
| 309 | */ | ||
| 310 | static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock) | ||
| 311 | { | ||
| 312 | struct entry *e = ep->entries + from_cblock(cblock); | ||
| 313 | return !hlist_unhashed(&e->hlist) ? e : NULL; | ||
| 314 | } | ||
| 315 | |||
| 316 | static bool epool_empty(struct entry_pool *ep) | ||
| 317 | { | ||
| 318 | return list_empty(&ep->free); | ||
| 319 | } | ||
| 320 | |||
| 321 | static bool in_pool(struct entry_pool *ep, struct entry *e) | ||
| 322 | { | ||
| 323 | return e >= ep->entries && e < ep->entries_end; | ||
| 324 | } | ||
| 325 | |||
| 326 | static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e) | ||
| 327 | { | ||
| 328 | return to_cblock(e - ep->entries); | ||
| 329 | } | ||
| 330 | |||
| 331 | /*----------------------------------------------------------------*/ | ||
| 332 | |||
| 232 | struct mq_policy { | 333 | struct mq_policy { |
| 233 | struct dm_cache_policy policy; | 334 | struct dm_cache_policy policy; |
| 234 | 335 | ||
| @@ -238,13 +339,22 @@ struct mq_policy { | |||
| 238 | struct io_tracker tracker; | 339 | struct io_tracker tracker; |
| 239 | 340 | ||
| 240 | /* | 341 | /* |
| 241 | * We maintain two queues of entries. The cache proper contains | 342 | * Entries come from two pools, one of pre-cache entries, and one |
| 242 | * the currently active mappings. Whereas the pre_cache tracks | 343 | * for the cache proper. |
| 243 | * blocks that are being hit frequently and potential candidates | 344 | */ |
| 244 | * for promotion to the cache. | 345 | struct entry_pool pre_cache_pool; |
| 346 | struct entry_pool cache_pool; | ||
| 347 | |||
| 348 | /* | ||
| 349 | * We maintain three queues of entries. The cache proper, | ||
| 350 | * consisting of a clean and dirty queue, contains the currently | ||
| 351 | * active mappings. Whereas the pre_cache tracks blocks that | ||
| 352 | * are being hit frequently and potential candidates for promotion | ||
| 353 | * to the cache. | ||
| 245 | */ | 354 | */ |
| 246 | struct queue pre_cache; | 355 | struct queue pre_cache; |
| 247 | struct queue cache; | 356 | struct queue cache_clean; |
| 357 | struct queue cache_dirty; | ||
| 248 | 358 | ||
| 249 | /* | 359 | /* |
| 250 | * Keeps track of time, incremented by the core. We use this to | 360 | * Keeps track of time, incremented by the core. We use this to |
| @@ -282,25 +392,6 @@ struct mq_policy { | |||
| 282 | unsigned promote_threshold; | 392 | unsigned promote_threshold; |
| 283 | 393 | ||
| 284 | /* | 394 | /* |
| 285 | * We need cache_size entries for the cache, and choose to have | ||
| 286 | * cache_size entries for the pre_cache too. One motivation for | ||
| 287 | * using the same size is to make the hit counts directly | ||
| 288 | * comparable between pre_cache and cache. | ||
| 289 | */ | ||
| 290 | unsigned nr_entries; | ||
| 291 | unsigned nr_entries_allocated; | ||
| 292 | struct list_head free; | ||
| 293 | |||
| 294 | /* | ||
| 295 | * Cache blocks may be unallocated. We store this info in a | ||
| 296 | * bitset. | ||
| 297 | */ | ||
| 298 | unsigned long *allocation_bitset; | ||
| 299 | unsigned nr_cblocks_allocated; | ||
| 300 | unsigned find_free_nr_words; | ||
| 301 | unsigned find_free_last_word; | ||
| 302 | |||
| 303 | /* | ||
| 304 | * The hash table allows us to quickly find an entry by origin | 395 | * The hash table allows us to quickly find an entry by origin |
| 305 | * block. Both pre_cache and cache entries are in here. | 396 | * block. Both pre_cache and cache entries are in here. |
| 306 | */ | 397 | */ |
| @@ -310,49 +401,6 @@ struct mq_policy { | |||
| 310 | }; | 401 | }; |
| 311 | 402 | ||
| 312 | /*----------------------------------------------------------------*/ | 403 | /*----------------------------------------------------------------*/ |
| 313 | /* Free/alloc mq cache entry structures. */ | ||
| 314 | static void takeout_queue(struct list_head *lh, struct queue *q) | ||
| 315 | { | ||
| 316 | unsigned level; | ||
| 317 | |||
| 318 | for (level = 0; level < NR_QUEUE_LEVELS; level++) | ||
| 319 | list_splice(q->qs + level, lh); | ||
| 320 | } | ||
| 321 | |||
| 322 | static void free_entries(struct mq_policy *mq) | ||
| 323 | { | ||
| 324 | struct entry *e, *tmp; | ||
| 325 | |||
| 326 | takeout_queue(&mq->free, &mq->pre_cache); | ||
| 327 | takeout_queue(&mq->free, &mq->cache); | ||
| 328 | |||
| 329 | list_for_each_entry_safe(e, tmp, &mq->free, list) | ||
| 330 | kmem_cache_free(mq_entry_cache, e); | ||
| 331 | } | ||
| 332 | |||
| 333 | static int alloc_entries(struct mq_policy *mq, unsigned elts) | ||
| 334 | { | ||
| 335 | unsigned u = mq->nr_entries; | ||
| 336 | |||
| 337 | INIT_LIST_HEAD(&mq->free); | ||
| 338 | mq->nr_entries_allocated = 0; | ||
| 339 | |||
| 340 | while (u--) { | ||
| 341 | struct entry *e = kmem_cache_zalloc(mq_entry_cache, GFP_KERNEL); | ||
| 342 | |||
| 343 | if (!e) { | ||
| 344 | free_entries(mq); | ||
| 345 | return -ENOMEM; | ||
| 346 | } | ||
| 347 | |||
| 348 | |||
| 349 | list_add(&e->list, &mq->free); | ||
| 350 | } | ||
| 351 | |||
| 352 | return 0; | ||
| 353 | } | ||
| 354 | |||
| 355 | /*----------------------------------------------------------------*/ | ||
| 356 | 404 | ||
| 357 | /* | 405 | /* |
| 358 | * Simple hash table implementation. Should replace with the standard hash | 406 | * Simple hash table implementation. Should replace with the standard hash |
| @@ -388,96 +436,14 @@ static void hash_remove(struct entry *e) | |||
| 388 | 436 | ||
| 389 | /*----------------------------------------------------------------*/ | 437 | /*----------------------------------------------------------------*/ |
| 390 | 438 | ||
| 391 | /* | ||
| 392 | * Allocates a new entry structure. The memory is allocated in one lump, | ||
| 393 | * so we are just handing it out here. Returns NULL if all entries have | ||
| 394 | * already been allocated. Cannot fail otherwise. | ||
| 395 | */ | ||
| 396 | static struct entry *alloc_entry(struct mq_policy *mq) | ||
| 397 | { | ||
| 398 | struct entry *e; | ||
| 399 | |||
| 400 | if (mq->nr_entries_allocated >= mq->nr_entries) { | ||
| 401 | BUG_ON(!list_empty(&mq->free)); | ||
| 402 | return NULL; | ||
| 403 | } | ||
| 404 | |||
| 405 | e = list_entry(list_pop(&mq->free), struct entry, list); | ||
| 406 | INIT_LIST_HEAD(&e->list); | ||
| 407 | INIT_HLIST_NODE(&e->hlist); | ||
| 408 | |||
| 409 | mq->nr_entries_allocated++; | ||
| 410 | return e; | ||
| 411 | } | ||
| 412 | |||
| 413 | /*----------------------------------------------------------------*/ | ||
| 414 | |||
| 415 | /* | ||
| 416 | * Mark cache blocks allocated or not in the bitset. | ||
| 417 | */ | ||
| 418 | static void alloc_cblock(struct mq_policy *mq, dm_cblock_t cblock) | ||
| 419 | { | ||
| 420 | BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size)); | ||
| 421 | BUG_ON(test_bit(from_cblock(cblock), mq->allocation_bitset)); | ||
| 422 | |||
| 423 | set_bit(from_cblock(cblock), mq->allocation_bitset); | ||
| 424 | mq->nr_cblocks_allocated++; | ||
| 425 | } | ||
| 426 | |||
| 427 | static void free_cblock(struct mq_policy *mq, dm_cblock_t cblock) | ||
| 428 | { | ||
| 429 | BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size)); | ||
| 430 | BUG_ON(!test_bit(from_cblock(cblock), mq->allocation_bitset)); | ||
| 431 | |||
| 432 | clear_bit(from_cblock(cblock), mq->allocation_bitset); | ||
| 433 | mq->nr_cblocks_allocated--; | ||
| 434 | } | ||
| 435 | |||
| 436 | static bool any_free_cblocks(struct mq_policy *mq) | 439 | static bool any_free_cblocks(struct mq_policy *mq) |
| 437 | { | 440 | { |
| 438 | return mq->nr_cblocks_allocated < from_cblock(mq->cache_size); | 441 | return !epool_empty(&mq->cache_pool); |
| 439 | } | 442 | } |
| 440 | 443 | ||
| 441 | /* | 444 | static bool any_clean_cblocks(struct mq_policy *mq) |
| 442 | * Fills result out with a cache block that isn't in use, or return | ||
| 443 | * -ENOSPC. This does _not_ mark the cblock as allocated, the caller is | ||
| 444 | * responsible for that. | ||
| 445 | */ | ||
| 446 | static int __find_free_cblock(struct mq_policy *mq, unsigned begin, unsigned end, | ||
| 447 | dm_cblock_t *result, unsigned *last_word) | ||
| 448 | { | 445 | { |
| 449 | int r = -ENOSPC; | 446 | return !queue_empty(&mq->cache_clean); |
| 450 | unsigned w; | ||
| 451 | |||
| 452 | for (w = begin; w < end; w++) { | ||
| 453 | /* | ||
| 454 | * ffz is undefined if no zero exists | ||
| 455 | */ | ||
| 456 | if (mq->allocation_bitset[w] != ~0UL) { | ||
| 457 | *last_word = w; | ||
| 458 | *result = to_cblock((w * BITS_PER_LONG) + ffz(mq->allocation_bitset[w])); | ||
| 459 | if (from_cblock(*result) < from_cblock(mq->cache_size)) | ||
| 460 | r = 0; | ||
| 461 | |||
| 462 | break; | ||
| 463 | } | ||
| 464 | } | ||
| 465 | |||
| 466 | return r; | ||
| 467 | } | ||
| 468 | |||
| 469 | static int find_free_cblock(struct mq_policy *mq, dm_cblock_t *result) | ||
| 470 | { | ||
| 471 | int r; | ||
| 472 | |||
| 473 | if (!any_free_cblocks(mq)) | ||
| 474 | return -ENOSPC; | ||
| 475 | |||
| 476 | r = __find_free_cblock(mq, mq->find_free_last_word, mq->find_free_nr_words, result, &mq->find_free_last_word); | ||
| 477 | if (r == -ENOSPC && mq->find_free_last_word) | ||
| 478 | r = __find_free_cblock(mq, 0, mq->find_free_last_word, result, &mq->find_free_last_word); | ||
| 479 | |||
| 480 | return r; | ||
| 481 | } | 447 | } |
| 482 | 448 | ||
| 483 | /*----------------------------------------------------------------*/ | 449 | /*----------------------------------------------------------------*/ |
| @@ -496,33 +462,35 @@ static unsigned queue_level(struct entry *e) | |||
| 496 | return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u); | 462 | return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u); |
| 497 | } | 463 | } |
| 498 | 464 | ||
| 465 | static bool in_cache(struct mq_policy *mq, struct entry *e) | ||
| 466 | { | ||
| 467 | return in_pool(&mq->cache_pool, e); | ||
| 468 | } | ||
| 469 | |||
| 499 | /* | 470 | /* |
| 500 | * Inserts the entry into the pre_cache or the cache. Ensures the cache | 471 | * Inserts the entry into the pre_cache or the cache. Ensures the cache |
| 501 | * block is marked as allocated if necessary. Inserts into the hash table. Sets the | 472 | * block is marked as allocated if necessary. Inserts into the hash table. |
| 502 | * tick which records when the entry was last moved about. | 473 | * Sets the tick which records when the entry was last moved about. |
| 503 | */ | 474 | */ |
| 504 | static void push(struct mq_policy *mq, struct entry *e) | 475 | static void push(struct mq_policy *mq, struct entry *e) |
| 505 | { | 476 | { |
| 506 | e->tick = mq->tick; | 477 | e->tick = mq->tick; |
| 507 | hash_insert(mq, e); | 478 | hash_insert(mq, e); |
| 508 | 479 | ||
| 509 | if (e->in_cache) { | 480 | if (in_cache(mq, e)) |
| 510 | alloc_cblock(mq, e->cblock); | 481 | queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean, |
| 511 | queue_push(&mq->cache, queue_level(e), &e->list); | 482 | queue_level(e), &e->list); |
| 512 | } else | 483 | else |
| 513 | queue_push(&mq->pre_cache, queue_level(e), &e->list); | 484 | queue_push(&mq->pre_cache, queue_level(e), &e->list); |
| 514 | } | 485 | } |
| 515 | 486 | ||
| 516 | /* | 487 | /* |
| 517 | * Removes an entry from pre_cache or cache. Removes from the hash table. | 488 | * Removes an entry from pre_cache or cache. Removes from the hash table. |
| 518 | * Frees the cache block if necessary. | ||
| 519 | */ | 489 | */ |
| 520 | static void del(struct mq_policy *mq, struct entry *e) | 490 | static void del(struct mq_policy *mq, struct entry *e) |
| 521 | { | 491 | { |
| 522 | queue_remove(&e->list); | 492 | queue_remove(&e->list); |
| 523 | hash_remove(e); | 493 | hash_remove(e); |
| 524 | if (e->in_cache) | ||
| 525 | free_cblock(mq, e->cblock); | ||
| 526 | } | 494 | } |
| 527 | 495 | ||
| 528 | /* | 496 | /* |
| @@ -531,14 +499,14 @@ static void del(struct mq_policy *mq, struct entry *e) | |||
| 531 | */ | 499 | */ |
| 532 | static struct entry *pop(struct mq_policy *mq, struct queue *q) | 500 | static struct entry *pop(struct mq_policy *mq, struct queue *q) |
| 533 | { | 501 | { |
| 534 | struct entry *e = container_of(queue_pop(q), struct entry, list); | 502 | struct entry *e; |
| 503 | struct list_head *h = queue_pop(q); | ||
| 535 | 504 | ||
| 536 | if (e) { | 505 | if (!h) |
| 537 | hash_remove(e); | 506 | return NULL; |
| 538 | 507 | ||
| 539 | if (e->in_cache) | 508 | e = container_of(h, struct entry, list); |
| 540 | free_cblock(mq, e->cblock); | 509 | hash_remove(e); |
| 541 | } | ||
| 542 | 510 | ||
| 543 | return e; | 511 | return e; |
| 544 | } | 512 | } |
| @@ -556,7 +524,8 @@ static bool updated_this_tick(struct mq_policy *mq, struct entry *e) | |||
| 556 | * of the entries. | 524 | * of the entries. |
| 557 | * | 525 | * |
| 558 | * At the moment the threshold is taken by averaging the hit counts of some | 526 | * At the moment the threshold is taken by averaging the hit counts of some |
| 559 | * of the entries in the cache (the first 20 entries of the first level). | 527 | * of the entries in the cache (the first 20 entries across all levels in |
| 528 | * ascending order, giving preference to the clean entries at each level). | ||
| 560 | * | 529 | * |
| 561 | * We can be much cleverer than this though. For example, each promotion | 530 | * We can be much cleverer than this though. For example, each promotion |
| 562 | * could bump up the threshold helping to prevent churn. Much more to do | 531 | * could bump up the threshold helping to prevent churn. Much more to do |
| @@ -571,14 +540,21 @@ static void check_generation(struct mq_policy *mq) | |||
| 571 | struct list_head *head; | 540 | struct list_head *head; |
| 572 | struct entry *e; | 541 | struct entry *e; |
| 573 | 542 | ||
| 574 | if ((mq->hit_count >= mq->generation_period) && | 543 | if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) { |
| 575 | (mq->nr_cblocks_allocated == from_cblock(mq->cache_size))) { | ||
| 576 | |||
| 577 | mq->hit_count = 0; | 544 | mq->hit_count = 0; |
| 578 | mq->generation++; | 545 | mq->generation++; |
| 579 | 546 | ||
| 580 | for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) { | 547 | for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) { |
| 581 | head = mq->cache.qs + level; | 548 | head = mq->cache_clean.qs + level; |
| 549 | list_for_each_entry(e, head, list) { | ||
| 550 | nr++; | ||
| 551 | total += e->hit_count; | ||
| 552 | |||
| 553 | if (++count >= MAX_TO_AVERAGE) | ||
| 554 | break; | ||
| 555 | } | ||
| 556 | |||
| 557 | head = mq->cache_dirty.qs + level; | ||
| 582 | list_for_each_entry(e, head, list) { | 558 | list_for_each_entry(e, head, list) { |
| 583 | nr++; | 559 | nr++; |
| 584 | total += e->hit_count; | 560 | total += e->hit_count; |
| @@ -631,19 +607,30 @@ static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e) | |||
| 631 | * - set the hit count to a hard coded value other than 1, eg, is it better | 607 | * - set the hit count to a hard coded value other than 1, eg, is it better |
| 632 | * if it goes in at level 2? | 608 | * if it goes in at level 2? |
| 633 | */ | 609 | */ |
| 634 | static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock) | 610 | static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock) |
| 635 | { | 611 | { |
| 636 | dm_cblock_t result; | 612 | struct entry *demoted = pop(mq, &mq->cache_clean); |
| 637 | struct entry *demoted = pop(mq, &mq->cache); | 613 | |
| 614 | if (!demoted) | ||
| 615 | /* | ||
| 616 | * We could get a block from mq->cache_dirty, but that | ||
| 617 | * would add extra latency to the triggering bio as it | ||
| 618 | * waits for the writeback. Better to not promote this | ||
| 619 | * time and hope there's a clean block next time this block | ||
| 620 | * is hit. | ||
| 621 | */ | ||
| 622 | return -ENOSPC; | ||
| 638 | 623 | ||
| 639 | BUG_ON(!demoted); | ||
| 640 | result = demoted->cblock; | ||
| 641 | *oblock = demoted->oblock; | 624 | *oblock = demoted->oblock; |
| 642 | demoted->in_cache = false; | 625 | free_entry(&mq->cache_pool, demoted); |
| 643 | demoted->hit_count = 1; | 626 | |
| 644 | push(mq, demoted); | 627 | /* |
| 628 | * We used to put the demoted block into the pre-cache, but I think | ||
| 629 | * it's simpler to just let it work it's way up from zero again. | ||
| 630 | * Stops blocks flickering in and out of the cache. | ||
| 631 | */ | ||
| 645 | 632 | ||
| 646 | return result; | 633 | return 0; |
| 647 | } | 634 | } |
| 648 | 635 | ||
| 649 | /* | 636 | /* |
| @@ -662,17 +649,18 @@ static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock) | |||
| 662 | static unsigned adjusted_promote_threshold(struct mq_policy *mq, | 649 | static unsigned adjusted_promote_threshold(struct mq_policy *mq, |
| 663 | bool discarded_oblock, int data_dir) | 650 | bool discarded_oblock, int data_dir) |
| 664 | { | 651 | { |
| 665 | if (discarded_oblock && any_free_cblocks(mq) && data_dir == WRITE) | 652 | if (data_dir == READ) |
| 653 | return mq->promote_threshold + READ_PROMOTE_THRESHOLD; | ||
| 654 | |||
| 655 | if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) { | ||
| 666 | /* | 656 | /* |
| 667 | * We don't need to do any copying at all, so give this a | 657 | * We don't need to do any copying at all, so give this a |
| 668 | * very low threshold. In practice this only triggers | 658 | * very low threshold. |
| 669 | * during initial population after a format. | ||
| 670 | */ | 659 | */ |
| 671 | return DISCARDED_PROMOTE_THRESHOLD; | 660 | return DISCARDED_PROMOTE_THRESHOLD; |
| 661 | } | ||
| 672 | 662 | ||
| 673 | return data_dir == READ ? | 663 | return mq->promote_threshold + WRITE_PROMOTE_THRESHOLD; |
| 674 | (mq->promote_threshold + READ_PROMOTE_THRESHOLD) : | ||
| 675 | (mq->promote_threshold + WRITE_PROMOTE_THRESHOLD); | ||
| 676 | } | 664 | } |
| 677 | 665 | ||
| 678 | static bool should_promote(struct mq_policy *mq, struct entry *e, | 666 | static bool should_promote(struct mq_policy *mq, struct entry *e, |
| @@ -688,34 +676,49 @@ static int cache_entry_found(struct mq_policy *mq, | |||
| 688 | { | 676 | { |
| 689 | requeue_and_update_tick(mq, e); | 677 | requeue_and_update_tick(mq, e); |
| 690 | 678 | ||
| 691 | if (e->in_cache) { | 679 | if (in_cache(mq, e)) { |
| 692 | result->op = POLICY_HIT; | 680 | result->op = POLICY_HIT; |
| 693 | result->cblock = e->cblock; | 681 | result->cblock = infer_cblock(&mq->cache_pool, e); |
| 694 | } | 682 | } |
| 695 | 683 | ||
| 696 | return 0; | 684 | return 0; |
| 697 | } | 685 | } |
| 698 | 686 | ||
| 699 | /* | 687 | /* |
| 700 | * Moves and entry from the pre_cache to the cache. The main work is | 688 | * Moves an entry from the pre_cache to the cache. The main work is |
| 701 | * finding which cache block to use. | 689 | * finding which cache block to use. |
| 702 | */ | 690 | */ |
| 703 | static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e, | 691 | static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e, |
| 704 | struct policy_result *result) | 692 | struct policy_result *result) |
| 705 | { | 693 | { |
| 706 | dm_cblock_t cblock; | 694 | int r; |
| 695 | struct entry *new_e; | ||
| 707 | 696 | ||
| 708 | if (find_free_cblock(mq, &cblock) == -ENOSPC) { | 697 | /* Ensure there's a free cblock in the cache */ |
| 698 | if (epool_empty(&mq->cache_pool)) { | ||
| 709 | result->op = POLICY_REPLACE; | 699 | result->op = POLICY_REPLACE; |
| 710 | cblock = demote_cblock(mq, &result->old_oblock); | 700 | r = demote_cblock(mq, &result->old_oblock); |
| 701 | if (r) { | ||
| 702 | result->op = POLICY_MISS; | ||
| 703 | return 0; | ||
| 704 | } | ||
| 711 | } else | 705 | } else |
| 712 | result->op = POLICY_NEW; | 706 | result->op = POLICY_NEW; |
| 713 | 707 | ||
| 714 | result->cblock = e->cblock = cblock; | 708 | new_e = alloc_entry(&mq->cache_pool); |
| 709 | BUG_ON(!new_e); | ||
| 710 | |||
| 711 | new_e->oblock = e->oblock; | ||
| 712 | new_e->dirty = false; | ||
| 713 | new_e->hit_count = e->hit_count; | ||
| 714 | new_e->generation = e->generation; | ||
| 715 | new_e->tick = e->tick; | ||
| 715 | 716 | ||
| 716 | del(mq, e); | 717 | del(mq, e); |
| 717 | e->in_cache = true; | 718 | free_entry(&mq->pre_cache_pool, e); |
| 718 | push(mq, e); | 719 | push(mq, new_e); |
| 720 | |||
| 721 | result->cblock = infer_cblock(&mq->cache_pool, new_e); | ||
| 719 | 722 | ||
| 720 | return 0; | 723 | return 0; |
| 721 | } | 724 | } |
| @@ -743,7 +746,7 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e, | |||
| 743 | static void insert_in_pre_cache(struct mq_policy *mq, | 746 | static void insert_in_pre_cache(struct mq_policy *mq, |
| 744 | dm_oblock_t oblock) | 747 | dm_oblock_t oblock) |
| 745 | { | 748 | { |
| 746 | struct entry *e = alloc_entry(mq); | 749 | struct entry *e = alloc_entry(&mq->pre_cache_pool); |
| 747 | 750 | ||
| 748 | if (!e) | 751 | if (!e) |
| 749 | /* | 752 | /* |
| @@ -757,7 +760,7 @@ static void insert_in_pre_cache(struct mq_policy *mq, | |||
| 757 | return; | 760 | return; |
| 758 | } | 761 | } |
| 759 | 762 | ||
| 760 | e->in_cache = false; | 763 | e->dirty = false; |
| 761 | e->oblock = oblock; | 764 | e->oblock = oblock; |
| 762 | e->hit_count = 1; | 765 | e->hit_count = 1; |
| 763 | e->generation = mq->generation; | 766 | e->generation = mq->generation; |
| @@ -767,30 +770,36 @@ static void insert_in_pre_cache(struct mq_policy *mq, | |||
| 767 | static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock, | 770 | static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock, |
| 768 | struct policy_result *result) | 771 | struct policy_result *result) |
| 769 | { | 772 | { |
| 773 | int r; | ||
| 770 | struct entry *e; | 774 | struct entry *e; |
| 771 | dm_cblock_t cblock; | ||
| 772 | 775 | ||
| 773 | if (find_free_cblock(mq, &cblock) == -ENOSPC) { | 776 | if (epool_empty(&mq->cache_pool)) { |
| 774 | result->op = POLICY_MISS; | 777 | result->op = POLICY_REPLACE; |
| 775 | insert_in_pre_cache(mq, oblock); | 778 | r = demote_cblock(mq, &result->old_oblock); |
| 776 | return; | 779 | if (unlikely(r)) { |
| 777 | } | 780 | result->op = POLICY_MISS; |
| 781 | insert_in_pre_cache(mq, oblock); | ||
| 782 | return; | ||
| 783 | } | ||
| 778 | 784 | ||
| 779 | e = alloc_entry(mq); | 785 | /* |
| 780 | if (unlikely(!e)) { | 786 | * This will always succeed, since we've just demoted. |
| 781 | result->op = POLICY_MISS; | 787 | */ |
| 782 | return; | 788 | e = alloc_entry(&mq->cache_pool); |
| 789 | BUG_ON(!e); | ||
| 790 | |||
| 791 | } else { | ||
| 792 | e = alloc_entry(&mq->cache_pool); | ||
| 793 | result->op = POLICY_NEW; | ||
| 783 | } | 794 | } |
| 784 | 795 | ||
| 785 | e->oblock = oblock; | 796 | e->oblock = oblock; |
| 786 | e->cblock = cblock; | 797 | e->dirty = false; |
| 787 | e->in_cache = true; | ||
| 788 | e->hit_count = 1; | 798 | e->hit_count = 1; |
| 789 | e->generation = mq->generation; | 799 | e->generation = mq->generation; |
| 790 | push(mq, e); | 800 | push(mq, e); |
| 791 | 801 | ||
| 792 | result->op = POLICY_NEW; | 802 | result->cblock = infer_cblock(&mq->cache_pool, e); |
| 793 | result->cblock = e->cblock; | ||
| 794 | } | 803 | } |
| 795 | 804 | ||
| 796 | static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock, | 805 | static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock, |
| @@ -821,13 +830,16 @@ static int map(struct mq_policy *mq, dm_oblock_t oblock, | |||
| 821 | int r = 0; | 830 | int r = 0; |
| 822 | struct entry *e = hash_lookup(mq, oblock); | 831 | struct entry *e = hash_lookup(mq, oblock); |
| 823 | 832 | ||
| 824 | if (e && e->in_cache) | 833 | if (e && in_cache(mq, e)) |
| 825 | r = cache_entry_found(mq, e, result); | 834 | r = cache_entry_found(mq, e, result); |
| 835 | |||
| 826 | else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL) | 836 | else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL) |
| 827 | result->op = POLICY_MISS; | 837 | result->op = POLICY_MISS; |
| 838 | |||
| 828 | else if (e) | 839 | else if (e) |
| 829 | r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock, | 840 | r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock, |
| 830 | data_dir, result); | 841 | data_dir, result); |
| 842 | |||
| 831 | else | 843 | else |
| 832 | r = no_entry_found(mq, oblock, can_migrate, discarded_oblock, | 844 | r = no_entry_found(mq, oblock, can_migrate, discarded_oblock, |
| 833 | data_dir, result); | 845 | data_dir, result); |
| @@ -854,9 +866,9 @@ static void mq_destroy(struct dm_cache_policy *p) | |||
| 854 | { | 866 | { |
| 855 | struct mq_policy *mq = to_mq_policy(p); | 867 | struct mq_policy *mq = to_mq_policy(p); |
| 856 | 868 | ||
| 857 | free_bitset(mq->allocation_bitset); | ||
| 858 | kfree(mq->table); | 869 | kfree(mq->table); |
| 859 | free_entries(mq); | 870 | epool_exit(&mq->cache_pool); |
| 871 | epool_exit(&mq->pre_cache_pool); | ||
| 860 | kfree(mq); | 872 | kfree(mq); |
| 861 | } | 873 | } |
| 862 | 874 | ||
| @@ -904,8 +916,8 @@ static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t | |||
| 904 | return -EWOULDBLOCK; | 916 | return -EWOULDBLOCK; |
| 905 | 917 | ||
| 906 | e = hash_lookup(mq, oblock); | 918 | e = hash_lookup(mq, oblock); |
| 907 | if (e && e->in_cache) { | 919 | if (e && in_cache(mq, e)) { |
| 908 | *cblock = e->cblock; | 920 | *cblock = infer_cblock(&mq->cache_pool, e); |
| 909 | r = 0; | 921 | r = 0; |
| 910 | } else | 922 | } else |
| 911 | r = -ENOENT; | 923 | r = -ENOENT; |
| @@ -915,6 +927,36 @@ static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t | |||
| 915 | return r; | 927 | return r; |
| 916 | } | 928 | } |
| 917 | 929 | ||
| 930 | static void __mq_set_clear_dirty(struct mq_policy *mq, dm_oblock_t oblock, bool set) | ||
| 931 | { | ||
| 932 | struct entry *e; | ||
| 933 | |||
| 934 | e = hash_lookup(mq, oblock); | ||
| 935 | BUG_ON(!e || !in_cache(mq, e)); | ||
| 936 | |||
| 937 | del(mq, e); | ||
| 938 | e->dirty = set; | ||
| 939 | push(mq, e); | ||
| 940 | } | ||
| 941 | |||
| 942 | static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) | ||
| 943 | { | ||
| 944 | struct mq_policy *mq = to_mq_policy(p); | ||
| 945 | |||
| 946 | mutex_lock(&mq->lock); | ||
| 947 | __mq_set_clear_dirty(mq, oblock, true); | ||
| 948 | mutex_unlock(&mq->lock); | ||
| 949 | } | ||
| 950 | |||
| 951 | static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) | ||
| 952 | { | ||
| 953 | struct mq_policy *mq = to_mq_policy(p); | ||
| 954 | |||
| 955 | mutex_lock(&mq->lock); | ||
| 956 | __mq_set_clear_dirty(mq, oblock, false); | ||
| 957 | mutex_unlock(&mq->lock); | ||
| 958 | } | ||
| 959 | |||
| 918 | static int mq_load_mapping(struct dm_cache_policy *p, | 960 | static int mq_load_mapping(struct dm_cache_policy *p, |
| 919 | dm_oblock_t oblock, dm_cblock_t cblock, | 961 | dm_oblock_t oblock, dm_cblock_t cblock, |
| 920 | uint32_t hint, bool hint_valid) | 962 | uint32_t hint, bool hint_valid) |
| @@ -922,13 +964,9 @@ static int mq_load_mapping(struct dm_cache_policy *p, | |||
| 922 | struct mq_policy *mq = to_mq_policy(p); | 964 | struct mq_policy *mq = to_mq_policy(p); |
| 923 | struct entry *e; | 965 | struct entry *e; |
| 924 | 966 | ||
| 925 | e = alloc_entry(mq); | 967 | e = alloc_particular_entry(&mq->cache_pool, cblock); |
| 926 | if (!e) | ||
| 927 | return -ENOMEM; | ||
| 928 | |||
| 929 | e->cblock = cblock; | ||
| 930 | e->oblock = oblock; | 968 | e->oblock = oblock; |
| 931 | e->in_cache = true; | 969 | e->dirty = false; /* this gets corrected in a minute */ |
| 932 | e->hit_count = hint_valid ? hint : 1; | 970 | e->hit_count = hint_valid ? hint : 1; |
| 933 | e->generation = mq->generation; | 971 | e->generation = mq->generation; |
| 934 | push(mq, e); | 972 | push(mq, e); |
| @@ -936,57 +974,126 @@ static int mq_load_mapping(struct dm_cache_policy *p, | |||
| 936 | return 0; | 974 | return 0; |
| 937 | } | 975 | } |
| 938 | 976 | ||
| 977 | static int mq_save_hints(struct mq_policy *mq, struct queue *q, | ||
| 978 | policy_walk_fn fn, void *context) | ||
| 979 | { | ||
| 980 | int r; | ||
| 981 | unsigned level; | ||
| 982 | struct entry *e; | ||
| 983 | |||
| 984 | for (level = 0; level < NR_QUEUE_LEVELS; level++) | ||
| 985 | list_for_each_entry(e, q->qs + level, list) { | ||
| 986 | r = fn(context, infer_cblock(&mq->cache_pool, e), | ||
| 987 | e->oblock, e->hit_count); | ||
| 988 | if (r) | ||
| 989 | return r; | ||
| 990 | } | ||
| 991 | |||
| 992 | return 0; | ||
| 993 | } | ||
| 994 | |||
| 939 | static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn, | 995 | static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn, |
| 940 | void *context) | 996 | void *context) |
| 941 | { | 997 | { |
| 942 | struct mq_policy *mq = to_mq_policy(p); | 998 | struct mq_policy *mq = to_mq_policy(p); |
| 943 | int r = 0; | 999 | int r = 0; |
| 944 | struct entry *e; | ||
| 945 | unsigned level; | ||
| 946 | 1000 | ||
| 947 | mutex_lock(&mq->lock); | 1001 | mutex_lock(&mq->lock); |
| 948 | 1002 | ||
| 949 | for (level = 0; level < NR_QUEUE_LEVELS; level++) | 1003 | r = mq_save_hints(mq, &mq->cache_clean, fn, context); |
| 950 | list_for_each_entry(e, &mq->cache.qs[level], list) { | 1004 | if (!r) |
| 951 | r = fn(context, e->cblock, e->oblock, e->hit_count); | 1005 | r = mq_save_hints(mq, &mq->cache_dirty, fn, context); |
| 952 | if (r) | ||
| 953 | goto out; | ||
| 954 | } | ||
| 955 | 1006 | ||
| 956 | out: | ||
| 957 | mutex_unlock(&mq->lock); | 1007 | mutex_unlock(&mq->lock); |
| 958 | 1008 | ||
| 959 | return r; | 1009 | return r; |
| 960 | } | 1010 | } |
| 961 | 1011 | ||
| 1012 | static void __remove_mapping(struct mq_policy *mq, dm_oblock_t oblock) | ||
| 1013 | { | ||
| 1014 | struct entry *e; | ||
| 1015 | |||
| 1016 | e = hash_lookup(mq, oblock); | ||
| 1017 | BUG_ON(!e || !in_cache(mq, e)); | ||
| 1018 | |||
| 1019 | del(mq, e); | ||
| 1020 | free_entry(&mq->cache_pool, e); | ||
| 1021 | } | ||
| 1022 | |||
| 962 | static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) | 1023 | static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) |
| 963 | { | 1024 | { |
| 964 | struct mq_policy *mq = to_mq_policy(p); | 1025 | struct mq_policy *mq = to_mq_policy(p); |
| 965 | struct entry *e; | ||
| 966 | 1026 | ||
| 967 | mutex_lock(&mq->lock); | 1027 | mutex_lock(&mq->lock); |
| 1028 | __remove_mapping(mq, oblock); | ||
| 1029 | mutex_unlock(&mq->lock); | ||
| 1030 | } | ||
| 968 | 1031 | ||
| 969 | e = hash_lookup(mq, oblock); | 1032 | static int __remove_cblock(struct mq_policy *mq, dm_cblock_t cblock) |
| 1033 | { | ||
| 1034 | struct entry *e = epool_find(&mq->cache_pool, cblock); | ||
| 970 | 1035 | ||
| 971 | BUG_ON(!e || !e->in_cache); | 1036 | if (!e) |
| 1037 | return -ENODATA; | ||
| 972 | 1038 | ||
| 973 | del(mq, e); | 1039 | del(mq, e); |
| 974 | e->in_cache = false; | 1040 | free_entry(&mq->cache_pool, e); |
| 975 | push(mq, e); | ||
| 976 | 1041 | ||
| 1042 | return 0; | ||
| 1043 | } | ||
| 1044 | |||
| 1045 | static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) | ||
| 1046 | { | ||
| 1047 | int r; | ||
| 1048 | struct mq_policy *mq = to_mq_policy(p); | ||
| 1049 | |||
| 1050 | mutex_lock(&mq->lock); | ||
| 1051 | r = __remove_cblock(mq, cblock); | ||
| 977 | mutex_unlock(&mq->lock); | 1052 | mutex_unlock(&mq->lock); |
| 1053 | |||
| 1054 | return r; | ||
| 978 | } | 1055 | } |
| 979 | 1056 | ||
| 980 | static void force_mapping(struct mq_policy *mq, | 1057 | static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock, |
| 981 | dm_oblock_t current_oblock, dm_oblock_t new_oblock) | 1058 | dm_cblock_t *cblock) |
| 982 | { | 1059 | { |
| 983 | struct entry *e = hash_lookup(mq, current_oblock); | 1060 | struct entry *e = pop(mq, &mq->cache_dirty); |
| 984 | 1061 | ||
| 985 | BUG_ON(!e || !e->in_cache); | 1062 | if (!e) |
| 1063 | return -ENODATA; | ||
| 986 | 1064 | ||
| 987 | del(mq, e); | 1065 | *oblock = e->oblock; |
| 988 | e->oblock = new_oblock; | 1066 | *cblock = infer_cblock(&mq->cache_pool, e); |
| 1067 | e->dirty = false; | ||
| 989 | push(mq, e); | 1068 | push(mq, e); |
| 1069 | |||
| 1070 | return 0; | ||
| 1071 | } | ||
| 1072 | |||
| 1073 | static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock, | ||
| 1074 | dm_cblock_t *cblock) | ||
| 1075 | { | ||
| 1076 | int r; | ||
| 1077 | struct mq_policy *mq = to_mq_policy(p); | ||
| 1078 | |||
| 1079 | mutex_lock(&mq->lock); | ||
| 1080 | r = __mq_writeback_work(mq, oblock, cblock); | ||
| 1081 | mutex_unlock(&mq->lock); | ||
| 1082 | |||
| 1083 | return r; | ||
| 1084 | } | ||
| 1085 | |||
| 1086 | static void __force_mapping(struct mq_policy *mq, | ||
| 1087 | dm_oblock_t current_oblock, dm_oblock_t new_oblock) | ||
| 1088 | { | ||
| 1089 | struct entry *e = hash_lookup(mq, current_oblock); | ||
| 1090 | |||
| 1091 | if (e && in_cache(mq, e)) { | ||
| 1092 | del(mq, e); | ||
| 1093 | e->oblock = new_oblock; | ||
| 1094 | e->dirty = true; | ||
| 1095 | push(mq, e); | ||
| 1096 | } | ||
| 990 | } | 1097 | } |
| 991 | 1098 | ||
| 992 | static void mq_force_mapping(struct dm_cache_policy *p, | 1099 | static void mq_force_mapping(struct dm_cache_policy *p, |
| @@ -995,16 +1102,20 @@ static void mq_force_mapping(struct dm_cache_policy *p, | |||
| 995 | struct mq_policy *mq = to_mq_policy(p); | 1102 | struct mq_policy *mq = to_mq_policy(p); |
| 996 | 1103 | ||
| 997 | mutex_lock(&mq->lock); | 1104 | mutex_lock(&mq->lock); |
| 998 | force_mapping(mq, current_oblock, new_oblock); | 1105 | __force_mapping(mq, current_oblock, new_oblock); |
| 999 | mutex_unlock(&mq->lock); | 1106 | mutex_unlock(&mq->lock); |
| 1000 | } | 1107 | } |
| 1001 | 1108 | ||
| 1002 | static dm_cblock_t mq_residency(struct dm_cache_policy *p) | 1109 | static dm_cblock_t mq_residency(struct dm_cache_policy *p) |
| 1003 | { | 1110 | { |
| 1111 | dm_cblock_t r; | ||
| 1004 | struct mq_policy *mq = to_mq_policy(p); | 1112 | struct mq_policy *mq = to_mq_policy(p); |
| 1005 | 1113 | ||
| 1006 | /* FIXME: lock mutex, not sure we can block here */ | 1114 | mutex_lock(&mq->lock); |
| 1007 | return to_cblock(mq->nr_cblocks_allocated); | 1115 | r = to_cblock(mq->cache_pool.nr_allocated); |
| 1116 | mutex_unlock(&mq->lock); | ||
| 1117 | |||
| 1118 | return r; | ||
| 1008 | } | 1119 | } |
| 1009 | 1120 | ||
| 1010 | static void mq_tick(struct dm_cache_policy *p) | 1121 | static void mq_tick(struct dm_cache_policy *p) |
| @@ -1057,10 +1168,13 @@ static void init_policy_functions(struct mq_policy *mq) | |||
| 1057 | mq->policy.destroy = mq_destroy; | 1168 | mq->policy.destroy = mq_destroy; |
| 1058 | mq->policy.map = mq_map; | 1169 | mq->policy.map = mq_map; |
| 1059 | mq->policy.lookup = mq_lookup; | 1170 | mq->policy.lookup = mq_lookup; |
| 1171 | mq->policy.set_dirty = mq_set_dirty; | ||
| 1172 | mq->policy.clear_dirty = mq_clear_dirty; | ||
| 1060 | mq->policy.load_mapping = mq_load_mapping; | 1173 | mq->policy.load_mapping = mq_load_mapping; |
| 1061 | mq->policy.walk_mappings = mq_walk_mappings; | 1174 | mq->policy.walk_mappings = mq_walk_mappings; |
| 1062 | mq->policy.remove_mapping = mq_remove_mapping; | 1175 | mq->policy.remove_mapping = mq_remove_mapping; |
| 1063 | mq->policy.writeback_work = NULL; | 1176 | mq->policy.remove_cblock = mq_remove_cblock; |
| 1177 | mq->policy.writeback_work = mq_writeback_work; | ||
| 1064 | mq->policy.force_mapping = mq_force_mapping; | 1178 | mq->policy.force_mapping = mq_force_mapping; |
| 1065 | mq->policy.residency = mq_residency; | 1179 | mq->policy.residency = mq_residency; |
| 1066 | mq->policy.tick = mq_tick; | 1180 | mq->policy.tick = mq_tick; |
| @@ -1072,7 +1186,6 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, | |||
| 1072 | sector_t origin_size, | 1186 | sector_t origin_size, |
| 1073 | sector_t cache_block_size) | 1187 | sector_t cache_block_size) |
| 1074 | { | 1188 | { |
| 1075 | int r; | ||
| 1076 | struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL); | 1189 | struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL); |
| 1077 | 1190 | ||
| 1078 | if (!mq) | 1191 | if (!mq) |
| @@ -1080,8 +1193,18 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, | |||
| 1080 | 1193 | ||
| 1081 | init_policy_functions(mq); | 1194 | init_policy_functions(mq); |
| 1082 | iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT); | 1195 | iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT); |
| 1083 | |||
| 1084 | mq->cache_size = cache_size; | 1196 | mq->cache_size = cache_size; |
| 1197 | |||
| 1198 | if (epool_init(&mq->pre_cache_pool, from_cblock(cache_size))) { | ||
| 1199 | DMERR("couldn't initialize pool of pre-cache entries"); | ||
| 1200 | goto bad_pre_cache_init; | ||
| 1201 | } | ||
| 1202 | |||
| 1203 | if (epool_init(&mq->cache_pool, from_cblock(cache_size))) { | ||
| 1204 | DMERR("couldn't initialize pool of cache entries"); | ||
| 1205 | goto bad_cache_init; | ||
| 1206 | } | ||
| 1207 | |||
| 1085 | mq->tick_protected = 0; | 1208 | mq->tick_protected = 0; |
| 1086 | mq->tick = 0; | 1209 | mq->tick = 0; |
| 1087 | mq->hit_count = 0; | 1210 | mq->hit_count = 0; |
| @@ -1089,20 +1212,12 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, | |||
| 1089 | mq->promote_threshold = 0; | 1212 | mq->promote_threshold = 0; |
| 1090 | mutex_init(&mq->lock); | 1213 | mutex_init(&mq->lock); |
| 1091 | spin_lock_init(&mq->tick_lock); | 1214 | spin_lock_init(&mq->tick_lock); |
| 1092 | mq->find_free_nr_words = dm_div_up(from_cblock(mq->cache_size), BITS_PER_LONG); | ||
| 1093 | mq->find_free_last_word = 0; | ||
| 1094 | 1215 | ||
| 1095 | queue_init(&mq->pre_cache); | 1216 | queue_init(&mq->pre_cache); |
| 1096 | queue_init(&mq->cache); | 1217 | queue_init(&mq->cache_clean); |
| 1097 | mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U); | 1218 | queue_init(&mq->cache_dirty); |
| 1098 | 1219 | ||
| 1099 | mq->nr_entries = 2 * from_cblock(cache_size); | 1220 | mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U); |
| 1100 | r = alloc_entries(mq, mq->nr_entries); | ||
| 1101 | if (r) | ||
| 1102 | goto bad_cache_alloc; | ||
| 1103 | |||
| 1104 | mq->nr_entries_allocated = 0; | ||
| 1105 | mq->nr_cblocks_allocated = 0; | ||
| 1106 | 1221 | ||
| 1107 | mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); | 1222 | mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); |
| 1108 | mq->hash_bits = ffs(mq->nr_buckets) - 1; | 1223 | mq->hash_bits = ffs(mq->nr_buckets) - 1; |
| @@ -1110,17 +1225,13 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, | |||
| 1110 | if (!mq->table) | 1225 | if (!mq->table) |
| 1111 | goto bad_alloc_table; | 1226 | goto bad_alloc_table; |
| 1112 | 1227 | ||
| 1113 | mq->allocation_bitset = alloc_bitset(from_cblock(cache_size)); | ||
| 1114 | if (!mq->allocation_bitset) | ||
| 1115 | goto bad_alloc_bitset; | ||
| 1116 | |||
| 1117 | return &mq->policy; | 1228 | return &mq->policy; |
| 1118 | 1229 | ||
| 1119 | bad_alloc_bitset: | ||
| 1120 | kfree(mq->table); | ||
| 1121 | bad_alloc_table: | 1230 | bad_alloc_table: |
| 1122 | free_entries(mq); | 1231 | epool_exit(&mq->cache_pool); |
| 1123 | bad_cache_alloc: | 1232 | bad_cache_init: |
| 1233 | epool_exit(&mq->pre_cache_pool); | ||
| 1234 | bad_pre_cache_init: | ||
| 1124 | kfree(mq); | 1235 | kfree(mq); |
| 1125 | 1236 | ||
| 1126 | return NULL; | 1237 | return NULL; |
| @@ -1130,7 +1241,7 @@ bad_cache_alloc: | |||
| 1130 | 1241 | ||
| 1131 | static struct dm_cache_policy_type mq_policy_type = { | 1242 | static struct dm_cache_policy_type mq_policy_type = { |
| 1132 | .name = "mq", | 1243 | .name = "mq", |
| 1133 | .version = {1, 0, 0}, | 1244 | .version = {1, 1, 0}, |
| 1134 | .hint_size = 4, | 1245 | .hint_size = 4, |
| 1135 | .owner = THIS_MODULE, | 1246 | .owner = THIS_MODULE, |
| 1136 | .create = mq_create | 1247 | .create = mq_create |
| @@ -1138,7 +1249,7 @@ static struct dm_cache_policy_type mq_policy_type = { | |||
| 1138 | 1249 | ||
| 1139 | static struct dm_cache_policy_type default_policy_type = { | 1250 | static struct dm_cache_policy_type default_policy_type = { |
| 1140 | .name = "default", | 1251 | .name = "default", |
| 1141 | .version = {1, 0, 0}, | 1252 | .version = {1, 1, 0}, |
| 1142 | .hint_size = 4, | 1253 | .hint_size = 4, |
| 1143 | .owner = THIS_MODULE, | 1254 | .owner = THIS_MODULE, |
| 1144 | .create = mq_create | 1255 | .create = mq_create |
diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c index 21c03c570c06..d80057968407 100644 --- a/drivers/md/dm-cache-policy.c +++ b/drivers/md/dm-cache-policy.c | |||
| @@ -119,13 +119,13 @@ struct dm_cache_policy *dm_cache_policy_create(const char *name, | |||
| 119 | type = get_policy(name); | 119 | type = get_policy(name); |
| 120 | if (!type) { | 120 | if (!type) { |
| 121 | DMWARN("unknown policy type"); | 121 | DMWARN("unknown policy type"); |
| 122 | return NULL; | 122 | return ERR_PTR(-EINVAL); |
| 123 | } | 123 | } |
| 124 | 124 | ||
| 125 | p = type->create(cache_size, origin_size, cache_block_size); | 125 | p = type->create(cache_size, origin_size, cache_block_size); |
| 126 | if (!p) { | 126 | if (!p) { |
| 127 | put_policy(type); | 127 | put_policy(type); |
| 128 | return NULL; | 128 | return ERR_PTR(-ENOMEM); |
| 129 | } | 129 | } |
| 130 | p->private = type; | 130 | p->private = type; |
| 131 | 131 | ||
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h index 33369ca9614f..052c00a84a5c 100644 --- a/drivers/md/dm-cache-policy.h +++ b/drivers/md/dm-cache-policy.h | |||
| @@ -135,9 +135,6 @@ struct dm_cache_policy { | |||
| 135 | */ | 135 | */ |
| 136 | int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock); | 136 | int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock); |
| 137 | 137 | ||
| 138 | /* | ||
| 139 | * oblock must be a mapped block. Must not block. | ||
| 140 | */ | ||
| 141 | void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); | 138 | void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); |
| 142 | void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); | 139 | void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); |
| 143 | 140 | ||
| @@ -159,8 +156,24 @@ struct dm_cache_policy { | |||
| 159 | void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock, | 156 | void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock, |
| 160 | dm_oblock_t new_oblock); | 157 | dm_oblock_t new_oblock); |
| 161 | 158 | ||
| 162 | int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock); | 159 | /* |
| 160 | * This is called via the invalidate_cblocks message. It is | ||
| 161 | * possible the particular cblock has already been removed due to a | ||
| 162 | * write io in passthrough mode. In which case this should return | ||
| 163 | * -ENODATA. | ||
| 164 | */ | ||
| 165 | int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock); | ||
| 163 | 166 | ||
| 167 | /* | ||
| 168 | * Provide a dirty block to be written back by the core target. | ||
| 169 | * | ||
| 170 | * Returns: | ||
| 171 | * | ||
| 172 | * 0 and @cblock,@oblock: block to write back provided | ||
| 173 | * | ||
| 174 | * -ENODATA: no dirty blocks available | ||
| 175 | */ | ||
| 176 | int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock); | ||
| 164 | 177 | ||
| 165 | /* | 178 | /* |
| 166 | * How full is the cache? | 179 | * How full is the cache? |
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 29569768ffbf..9efcf1059b99 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c | |||
| @@ -61,6 +61,34 @@ static void free_bitset(unsigned long *bits) | |||
| 61 | 61 | ||
| 62 | /*----------------------------------------------------------------*/ | 62 | /*----------------------------------------------------------------*/ |
| 63 | 63 | ||
| 64 | /* | ||
| 65 | * There are a couple of places where we let a bio run, but want to do some | ||
| 66 | * work before calling its endio function. We do this by temporarily | ||
| 67 | * changing the endio fn. | ||
| 68 | */ | ||
| 69 | struct dm_hook_info { | ||
| 70 | bio_end_io_t *bi_end_io; | ||
| 71 | void *bi_private; | ||
| 72 | }; | ||
| 73 | |||
| 74 | static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, | ||
| 75 | bio_end_io_t *bi_end_io, void *bi_private) | ||
| 76 | { | ||
| 77 | h->bi_end_io = bio->bi_end_io; | ||
| 78 | h->bi_private = bio->bi_private; | ||
| 79 | |||
| 80 | bio->bi_end_io = bi_end_io; | ||
| 81 | bio->bi_private = bi_private; | ||
| 82 | } | ||
| 83 | |||
| 84 | static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) | ||
| 85 | { | ||
| 86 | bio->bi_end_io = h->bi_end_io; | ||
| 87 | bio->bi_private = h->bi_private; | ||
| 88 | } | ||
| 89 | |||
| 90 | /*----------------------------------------------------------------*/ | ||
| 91 | |||
| 64 | #define PRISON_CELLS 1024 | 92 | #define PRISON_CELLS 1024 |
| 65 | #define MIGRATION_POOL_SIZE 128 | 93 | #define MIGRATION_POOL_SIZE 128 |
| 66 | #define COMMIT_PERIOD HZ | 94 | #define COMMIT_PERIOD HZ |
| @@ -76,14 +104,37 @@ static void free_bitset(unsigned long *bits) | |||
| 76 | /* | 104 | /* |
| 77 | * FIXME: the cache is read/write for the time being. | 105 | * FIXME: the cache is read/write for the time being. |
| 78 | */ | 106 | */ |
| 79 | enum cache_mode { | 107 | enum cache_metadata_mode { |
| 80 | CM_WRITE, /* metadata may be changed */ | 108 | CM_WRITE, /* metadata may be changed */ |
| 81 | CM_READ_ONLY, /* metadata may not be changed */ | 109 | CM_READ_ONLY, /* metadata may not be changed */ |
| 82 | }; | 110 | }; |
| 83 | 111 | ||
| 112 | enum cache_io_mode { | ||
| 113 | /* | ||
| 114 | * Data is written to cached blocks only. These blocks are marked | ||
| 115 | * dirty. If you lose the cache device you will lose data. | ||
| 116 | * Potential performance increase for both reads and writes. | ||
| 117 | */ | ||
| 118 | CM_IO_WRITEBACK, | ||
| 119 | |||
| 120 | /* | ||
| 121 | * Data is written to both cache and origin. Blocks are never | ||
| 122 | * dirty. Potential performance benefit for reads only. | ||
| 123 | */ | ||
| 124 | CM_IO_WRITETHROUGH, | ||
| 125 | |||
| 126 | /* | ||
| 127 | * A degraded mode useful for various cache coherency situations | ||
| 128 | * (e.g., rolling back snapshots). Reads and writes always go to the | ||
| 129 | * origin. If a write goes to a cached oblock, then the cache | ||
| 130 | * block is invalidated. | ||
| 131 | */ | ||
| 132 | CM_IO_PASSTHROUGH | ||
| 133 | }; | ||
| 134 | |||
| 84 | struct cache_features { | 135 | struct cache_features { |
| 85 | enum cache_mode mode; | 136 | enum cache_metadata_mode mode; |
| 86 | bool write_through:1; | 137 | enum cache_io_mode io_mode; |
| 87 | }; | 138 | }; |
| 88 | 139 | ||
| 89 | struct cache_stats { | 140 | struct cache_stats { |
| @@ -99,6 +150,25 @@ struct cache_stats { | |||
| 99 | atomic_t discard_count; | 150 | atomic_t discard_count; |
| 100 | }; | 151 | }; |
| 101 | 152 | ||
| 153 | /* | ||
| 154 | * Defines a range of cblocks: begin to (end - 1) are in the range; end is | ||
| 155 | * the one-past-the-end value. | ||
| 156 | */ | ||
| 157 | struct cblock_range { | ||
| 158 | dm_cblock_t begin; | ||
| 159 | dm_cblock_t end; | ||
| 160 | }; | ||
| 161 | |||
| 162 | struct invalidation_request { | ||
| 163 | struct list_head list; | ||
| 164 | struct cblock_range *cblocks; | ||
| 165 | |||
| 166 | atomic_t complete; | ||
| 167 | int err; | ||
| 168 | |||
| 169 | wait_queue_head_t result_wait; | ||
| 170 | }; | ||
| 171 | |||
| 102 | struct cache { | 172 | struct cache { |
| 103 | struct dm_target *ti; | 173 | struct dm_target *ti; |
| 104 | struct dm_target_callbacks callbacks; | 174 | struct dm_target_callbacks callbacks; |
| @@ -148,6 +218,10 @@ struct cache { | |||
| 148 | wait_queue_head_t migration_wait; | 218 | wait_queue_head_t migration_wait; |
| 149 | atomic_t nr_migrations; | 219 | atomic_t nr_migrations; |
| 150 | 220 | ||
| 221 | wait_queue_head_t quiescing_wait; | ||
| 222 | atomic_t quiescing; | ||
| 223 | atomic_t quiescing_ack; | ||
| 224 | |||
| 151 | /* | 225 | /* |
| 152 | * cache_size entries, dirty if set | 226 | * cache_size entries, dirty if set |
| 153 | */ | 227 | */ |
| @@ -186,7 +260,7 @@ struct cache { | |||
| 186 | 260 | ||
| 187 | bool need_tick_bio:1; | 261 | bool need_tick_bio:1; |
| 188 | bool sized:1; | 262 | bool sized:1; |
| 189 | bool quiescing:1; | 263 | bool invalidate:1; |
| 190 | bool commit_requested:1; | 264 | bool commit_requested:1; |
| 191 | bool loaded_mappings:1; | 265 | bool loaded_mappings:1; |
| 192 | bool loaded_discards:1; | 266 | bool loaded_discards:1; |
| @@ -197,6 +271,12 @@ struct cache { | |||
| 197 | struct cache_features features; | 271 | struct cache_features features; |
| 198 | 272 | ||
| 199 | struct cache_stats stats; | 273 | struct cache_stats stats; |
| 274 | |||
| 275 | /* | ||
| 276 | * Invalidation fields. | ||
| 277 | */ | ||
| 278 | spinlock_t invalidation_lock; | ||
| 279 | struct list_head invalidation_requests; | ||
| 200 | }; | 280 | }; |
| 201 | 281 | ||
| 202 | struct per_bio_data { | 282 | struct per_bio_data { |
| @@ -211,7 +291,7 @@ struct per_bio_data { | |||
| 211 | */ | 291 | */ |
| 212 | struct cache *cache; | 292 | struct cache *cache; |
| 213 | dm_cblock_t cblock; | 293 | dm_cblock_t cblock; |
| 214 | bio_end_io_t *saved_bi_end_io; | 294 | struct dm_hook_info hook_info; |
| 215 | struct dm_bio_details bio_details; | 295 | struct dm_bio_details bio_details; |
| 216 | }; | 296 | }; |
| 217 | 297 | ||
| @@ -228,6 +308,8 @@ struct dm_cache_migration { | |||
| 228 | bool writeback:1; | 308 | bool writeback:1; |
| 229 | bool demote:1; | 309 | bool demote:1; |
| 230 | bool promote:1; | 310 | bool promote:1; |
| 311 | bool requeue_holder:1; | ||
| 312 | bool invalidate:1; | ||
| 231 | 313 | ||
| 232 | struct dm_bio_prison_cell *old_ocell; | 314 | struct dm_bio_prison_cell *old_ocell; |
| 233 | struct dm_bio_prison_cell *new_ocell; | 315 | struct dm_bio_prison_cell *new_ocell; |
| @@ -533,9 +615,24 @@ static void save_stats(struct cache *cache) | |||
| 533 | #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) | 615 | #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) |
| 534 | #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) | 616 | #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) |
| 535 | 617 | ||
| 618 | static bool writethrough_mode(struct cache_features *f) | ||
| 619 | { | ||
| 620 | return f->io_mode == CM_IO_WRITETHROUGH; | ||
| 621 | } | ||
| 622 | |||
| 623 | static bool writeback_mode(struct cache_features *f) | ||
| 624 | { | ||
| 625 | return f->io_mode == CM_IO_WRITEBACK; | ||
| 626 | } | ||
| 627 | |||
| 628 | static bool passthrough_mode(struct cache_features *f) | ||
| 629 | { | ||
| 630 | return f->io_mode == CM_IO_PASSTHROUGH; | ||
| 631 | } | ||
| 632 | |||
| 536 | static size_t get_per_bio_data_size(struct cache *cache) | 633 | static size_t get_per_bio_data_size(struct cache *cache) |
| 537 | { | 634 | { |
| 538 | return cache->features.write_through ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; | 635 | return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; |
| 539 | } | 636 | } |
| 540 | 637 | ||
| 541 | static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) | 638 | static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) |
| @@ -605,6 +702,7 @@ static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, | |||
| 605 | static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, | 702 | static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, |
| 606 | dm_oblock_t oblock, dm_cblock_t cblock) | 703 | dm_oblock_t oblock, dm_cblock_t cblock) |
| 607 | { | 704 | { |
| 705 | check_if_tick_bio_needed(cache, bio); | ||
| 608 | remap_to_cache(cache, bio, cblock); | 706 | remap_to_cache(cache, bio, cblock); |
| 609 | if (bio_data_dir(bio) == WRITE) { | 707 | if (bio_data_dir(bio) == WRITE) { |
| 610 | set_dirty(cache, oblock, cblock); | 708 | set_dirty(cache, oblock, cblock); |
| @@ -662,7 +760,8 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio) | |||
| 662 | static void writethrough_endio(struct bio *bio, int err) | 760 | static void writethrough_endio(struct bio *bio, int err) |
| 663 | { | 761 | { |
| 664 | struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); | 762 | struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); |
| 665 | bio->bi_end_io = pb->saved_bi_end_io; | 763 | |
| 764 | dm_unhook_bio(&pb->hook_info, bio); | ||
| 666 | 765 | ||
| 667 | if (err) { | 766 | if (err) { |
| 668 | bio_endio(bio, err); | 767 | bio_endio(bio, err); |
| @@ -693,9 +792,8 @@ static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, | |||
| 693 | 792 | ||
| 694 | pb->cache = cache; | 793 | pb->cache = cache; |
| 695 | pb->cblock = cblock; | 794 | pb->cblock = cblock; |
| 696 | pb->saved_bi_end_io = bio->bi_end_io; | 795 | dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL); |
| 697 | dm_bio_record(&pb->bio_details, bio); | 796 | dm_bio_record(&pb->bio_details, bio); |
| 698 | bio->bi_end_io = writethrough_endio; | ||
| 699 | 797 | ||
| 700 | remap_to_origin_clear_discard(pb->cache, bio, oblock); | 798 | remap_to_origin_clear_discard(pb->cache, bio, oblock); |
| 701 | } | 799 | } |
| @@ -748,8 +846,9 @@ static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, | |||
| 748 | 846 | ||
| 749 | static void cleanup_migration(struct dm_cache_migration *mg) | 847 | static void cleanup_migration(struct dm_cache_migration *mg) |
| 750 | { | 848 | { |
| 751 | dec_nr_migrations(mg->cache); | 849 | struct cache *cache = mg->cache; |
| 752 | free_migration(mg); | 850 | free_migration(mg); |
| 851 | dec_nr_migrations(cache); | ||
| 753 | } | 852 | } |
| 754 | 853 | ||
| 755 | static void migration_failure(struct dm_cache_migration *mg) | 854 | static void migration_failure(struct dm_cache_migration *mg) |
| @@ -765,13 +864,13 @@ static void migration_failure(struct dm_cache_migration *mg) | |||
| 765 | DMWARN_LIMIT("demotion failed; couldn't copy block"); | 864 | DMWARN_LIMIT("demotion failed; couldn't copy block"); |
| 766 | policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock); | 865 | policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock); |
| 767 | 866 | ||
| 768 | cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1); | 867 | cell_defer(cache, mg->old_ocell, mg->promote ? false : true); |
| 769 | if (mg->promote) | 868 | if (mg->promote) |
| 770 | cell_defer(cache, mg->new_ocell, 1); | 869 | cell_defer(cache, mg->new_ocell, true); |
| 771 | } else { | 870 | } else { |
| 772 | DMWARN_LIMIT("promotion failed; couldn't copy block"); | 871 | DMWARN_LIMIT("promotion failed; couldn't copy block"); |
| 773 | policy_remove_mapping(cache->policy, mg->new_oblock); | 872 | policy_remove_mapping(cache->policy, mg->new_oblock); |
| 774 | cell_defer(cache, mg->new_ocell, 1); | 873 | cell_defer(cache, mg->new_ocell, true); |
| 775 | } | 874 | } |
| 776 | 875 | ||
| 777 | cleanup_migration(mg); | 876 | cleanup_migration(mg); |
| @@ -823,7 +922,7 @@ static void migration_success_post_commit(struct dm_cache_migration *mg) | |||
| 823 | return; | 922 | return; |
| 824 | 923 | ||
| 825 | } else if (mg->demote) { | 924 | } else if (mg->demote) { |
| 826 | cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1); | 925 | cell_defer(cache, mg->old_ocell, mg->promote ? false : true); |
| 827 | 926 | ||
| 828 | if (mg->promote) { | 927 | if (mg->promote) { |
| 829 | mg->demote = false; | 928 | mg->demote = false; |
| @@ -832,11 +931,19 @@ static void migration_success_post_commit(struct dm_cache_migration *mg) | |||
| 832 | list_add_tail(&mg->list, &cache->quiesced_migrations); | 931 | list_add_tail(&mg->list, &cache->quiesced_migrations); |
| 833 | spin_unlock_irqrestore(&cache->lock, flags); | 932 | spin_unlock_irqrestore(&cache->lock, flags); |
| 834 | 933 | ||
| 835 | } else | 934 | } else { |
| 935 | if (mg->invalidate) | ||
| 936 | policy_remove_mapping(cache->policy, mg->old_oblock); | ||
| 836 | cleanup_migration(mg); | 937 | cleanup_migration(mg); |
| 938 | } | ||
| 837 | 939 | ||
| 838 | } else { | 940 | } else { |
| 839 | cell_defer(cache, mg->new_ocell, true); | 941 | if (mg->requeue_holder) |
| 942 | cell_defer(cache, mg->new_ocell, true); | ||
| 943 | else { | ||
| 944 | bio_endio(mg->new_ocell->holder, 0); | ||
| 945 | cell_defer(cache, mg->new_ocell, false); | ||
| 946 | } | ||
| 840 | clear_dirty(cache, mg->new_oblock, mg->cblock); | 947 | clear_dirty(cache, mg->new_oblock, mg->cblock); |
| 841 | cleanup_migration(mg); | 948 | cleanup_migration(mg); |
| 842 | } | 949 | } |
| @@ -881,8 +988,46 @@ static void issue_copy_real(struct dm_cache_migration *mg) | |||
| 881 | r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg); | 988 | r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg); |
| 882 | } | 989 | } |
| 883 | 990 | ||
| 884 | if (r < 0) | 991 | if (r < 0) { |
| 992 | DMERR_LIMIT("issuing migration failed"); | ||
| 885 | migration_failure(mg); | 993 | migration_failure(mg); |
| 994 | } | ||
| 995 | } | ||
| 996 | |||
| 997 | static void overwrite_endio(struct bio *bio, int err) | ||
| 998 | { | ||
| 999 | struct dm_cache_migration *mg = bio->bi_private; | ||
| 1000 | struct cache *cache = mg->cache; | ||
| 1001 | size_t pb_data_size = get_per_bio_data_size(cache); | ||
| 1002 | struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); | ||
| 1003 | unsigned long flags; | ||
| 1004 | |||
| 1005 | if (err) | ||
| 1006 | mg->err = true; | ||
| 1007 | |||
| 1008 | spin_lock_irqsave(&cache->lock, flags); | ||
| 1009 | list_add_tail(&mg->list, &cache->completed_migrations); | ||
| 1010 | dm_unhook_bio(&pb->hook_info, bio); | ||
| 1011 | mg->requeue_holder = false; | ||
| 1012 | spin_unlock_irqrestore(&cache->lock, flags); | ||
| 1013 | |||
| 1014 | wake_worker(cache); | ||
| 1015 | } | ||
| 1016 | |||
| 1017 | static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) | ||
| 1018 | { | ||
| 1019 | size_t pb_data_size = get_per_bio_data_size(mg->cache); | ||
| 1020 | struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); | ||
| 1021 | |||
| 1022 | dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); | ||
| 1023 | remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock); | ||
| 1024 | generic_make_request(bio); | ||
| 1025 | } | ||
| 1026 | |||
| 1027 | static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) | ||
| 1028 | { | ||
| 1029 | return (bio_data_dir(bio) == WRITE) && | ||
| 1030 | (bio->bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); | ||
| 886 | } | 1031 | } |
| 887 | 1032 | ||
| 888 | static void avoid_copy(struct dm_cache_migration *mg) | 1033 | static void avoid_copy(struct dm_cache_migration *mg) |
| @@ -899,9 +1044,17 @@ static void issue_copy(struct dm_cache_migration *mg) | |||
| 899 | if (mg->writeback || mg->demote) | 1044 | if (mg->writeback || mg->demote) |
| 900 | avoid = !is_dirty(cache, mg->cblock) || | 1045 | avoid = !is_dirty(cache, mg->cblock) || |
| 901 | is_discarded_oblock(cache, mg->old_oblock); | 1046 | is_discarded_oblock(cache, mg->old_oblock); |
| 902 | else | 1047 | else { |
| 1048 | struct bio *bio = mg->new_ocell->holder; | ||
| 1049 | |||
| 903 | avoid = is_discarded_oblock(cache, mg->new_oblock); | 1050 | avoid = is_discarded_oblock(cache, mg->new_oblock); |
| 904 | 1051 | ||
| 1052 | if (!avoid && bio_writes_complete_block(cache, bio)) { | ||
| 1053 | issue_overwrite(mg, bio); | ||
| 1054 | return; | ||
| 1055 | } | ||
| 1056 | } | ||
| 1057 | |||
| 905 | avoid ? avoid_copy(mg) : issue_copy_real(mg); | 1058 | avoid ? avoid_copy(mg) : issue_copy_real(mg); |
| 906 | } | 1059 | } |
| 907 | 1060 | ||
| @@ -991,6 +1144,8 @@ static void promote(struct cache *cache, struct prealloc *structs, | |||
| 991 | mg->writeback = false; | 1144 | mg->writeback = false; |
| 992 | mg->demote = false; | 1145 | mg->demote = false; |
| 993 | mg->promote = true; | 1146 | mg->promote = true; |
| 1147 | mg->requeue_holder = true; | ||
| 1148 | mg->invalidate = false; | ||
| 994 | mg->cache = cache; | 1149 | mg->cache = cache; |
| 995 | mg->new_oblock = oblock; | 1150 | mg->new_oblock = oblock; |
| 996 | mg->cblock = cblock; | 1151 | mg->cblock = cblock; |
| @@ -1012,6 +1167,8 @@ static void writeback(struct cache *cache, struct prealloc *structs, | |||
| 1012 | mg->writeback = true; | 1167 | mg->writeback = true; |
| 1013 | mg->demote = false; | 1168 | mg->demote = false; |
| 1014 | mg->promote = false; | 1169 | mg->promote = false; |
| 1170 | mg->requeue_holder = true; | ||
| 1171 | mg->invalidate = false; | ||
| 1015 | mg->cache = cache; | 1172 | mg->cache = cache; |
| 1016 | mg->old_oblock = oblock; | 1173 | mg->old_oblock = oblock; |
| 1017 | mg->cblock = cblock; | 1174 | mg->cblock = cblock; |
| @@ -1035,6 +1192,8 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs, | |||
| 1035 | mg->writeback = false; | 1192 | mg->writeback = false; |
| 1036 | mg->demote = true; | 1193 | mg->demote = true; |
| 1037 | mg->promote = true; | 1194 | mg->promote = true; |
| 1195 | mg->requeue_holder = true; | ||
| 1196 | mg->invalidate = false; | ||
| 1038 | mg->cache = cache; | 1197 | mg->cache = cache; |
| 1039 | mg->old_oblock = old_oblock; | 1198 | mg->old_oblock = old_oblock; |
| 1040 | mg->new_oblock = new_oblock; | 1199 | mg->new_oblock = new_oblock; |
| @@ -1047,6 +1206,33 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs, | |||
| 1047 | quiesce_migration(mg); | 1206 | quiesce_migration(mg); |
| 1048 | } | 1207 | } |
| 1049 | 1208 | ||
| 1209 | /* | ||
| 1210 | * Invalidate a cache entry. No writeback occurs; any changes in the cache | ||
| 1211 | * block are thrown away. | ||
| 1212 | */ | ||
| 1213 | static void invalidate(struct cache *cache, struct prealloc *structs, | ||
| 1214 | dm_oblock_t oblock, dm_cblock_t cblock, | ||
| 1215 | struct dm_bio_prison_cell *cell) | ||
| 1216 | { | ||
| 1217 | struct dm_cache_migration *mg = prealloc_get_migration(structs); | ||
| 1218 | |||
| 1219 | mg->err = false; | ||
| 1220 | mg->writeback = false; | ||
| 1221 | mg->demote = true; | ||
| 1222 | mg->promote = false; | ||
| 1223 | mg->requeue_holder = true; | ||
| 1224 | mg->invalidate = true; | ||
| 1225 | mg->cache = cache; | ||
| 1226 | mg->old_oblock = oblock; | ||
| 1227 | mg->cblock = cblock; | ||
| 1228 | mg->old_ocell = cell; | ||
| 1229 | mg->new_ocell = NULL; | ||
| 1230 | mg->start_jiffies = jiffies; | ||
| 1231 | |||
| 1232 | inc_nr_migrations(cache); | ||
| 1233 | quiesce_migration(mg); | ||
| 1234 | } | ||
| 1235 | |||
| 1050 | /*---------------------------------------------------------------- | 1236 | /*---------------------------------------------------------------- |
| 1051 | * bio processing | 1237 | * bio processing |
| 1052 | *--------------------------------------------------------------*/ | 1238 | *--------------------------------------------------------------*/ |
| @@ -1109,13 +1295,6 @@ static bool spare_migration_bandwidth(struct cache *cache) | |||
| 1109 | return current_volume < cache->migration_threshold; | 1295 | return current_volume < cache->migration_threshold; |
| 1110 | } | 1296 | } |
| 1111 | 1297 | ||
| 1112 | static bool is_writethrough_io(struct cache *cache, struct bio *bio, | ||
| 1113 | dm_cblock_t cblock) | ||
| 1114 | { | ||
| 1115 | return bio_data_dir(bio) == WRITE && | ||
| 1116 | cache->features.write_through && !is_dirty(cache, cblock); | ||
| 1117 | } | ||
| 1118 | |||
| 1119 | static void inc_hit_counter(struct cache *cache, struct bio *bio) | 1298 | static void inc_hit_counter(struct cache *cache, struct bio *bio) |
| 1120 | { | 1299 | { |
| 1121 | atomic_inc(bio_data_dir(bio) == READ ? | 1300 | atomic_inc(bio_data_dir(bio) == READ ? |
| @@ -1128,6 +1307,15 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio) | |||
| 1128 | &cache->stats.read_miss : &cache->stats.write_miss); | 1307 | &cache->stats.read_miss : &cache->stats.write_miss); |
| 1129 | } | 1308 | } |
| 1130 | 1309 | ||
| 1310 | static void issue_cache_bio(struct cache *cache, struct bio *bio, | ||
| 1311 | struct per_bio_data *pb, | ||
| 1312 | dm_oblock_t oblock, dm_cblock_t cblock) | ||
| 1313 | { | ||
| 1314 | pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); | ||
| 1315 | remap_to_cache_dirty(cache, bio, oblock, cblock); | ||
| 1316 | issue(cache, bio); | ||
| 1317 | } | ||
| 1318 | |||
| 1131 | static void process_bio(struct cache *cache, struct prealloc *structs, | 1319 | static void process_bio(struct cache *cache, struct prealloc *structs, |
| 1132 | struct bio *bio) | 1320 | struct bio *bio) |
| 1133 | { | 1321 | { |
| @@ -1139,7 +1327,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs, | |||
| 1139 | size_t pb_data_size = get_per_bio_data_size(cache); | 1327 | size_t pb_data_size = get_per_bio_data_size(cache); |
| 1140 | struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); | 1328 | struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); |
| 1141 | bool discarded_block = is_discarded_oblock(cache, block); | 1329 | bool discarded_block = is_discarded_oblock(cache, block); |
| 1142 | bool can_migrate = discarded_block || spare_migration_bandwidth(cache); | 1330 | bool passthrough = passthrough_mode(&cache->features); |
| 1331 | bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache)); | ||
| 1143 | 1332 | ||
| 1144 | /* | 1333 | /* |
| 1145 | * Check to see if that block is currently migrating. | 1334 | * Check to see if that block is currently migrating. |
| @@ -1160,15 +1349,39 @@ static void process_bio(struct cache *cache, struct prealloc *structs, | |||
| 1160 | 1349 | ||
| 1161 | switch (lookup_result.op) { | 1350 | switch (lookup_result.op) { |
| 1162 | case POLICY_HIT: | 1351 | case POLICY_HIT: |
| 1163 | inc_hit_counter(cache, bio); | 1352 | if (passthrough) { |
| 1164 | pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); | 1353 | inc_miss_counter(cache, bio); |
| 1165 | 1354 | ||
| 1166 | if (is_writethrough_io(cache, bio, lookup_result.cblock)) | 1355 | /* |
| 1167 | remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); | 1356 | * Passthrough always maps to the origin, |
| 1168 | else | 1357 | * invalidating any cache blocks that are written |
| 1169 | remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); | 1358 | * to. |
| 1359 | */ | ||
| 1360 | |||
| 1361 | if (bio_data_dir(bio) == WRITE) { | ||
| 1362 | atomic_inc(&cache->stats.demotion); | ||
| 1363 | invalidate(cache, structs, block, lookup_result.cblock, new_ocell); | ||
| 1364 | release_cell = false; | ||
| 1365 | |||
| 1366 | } else { | ||
| 1367 | /* FIXME: factor out issue_origin() */ | ||
| 1368 | pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); | ||
| 1369 | remap_to_origin_clear_discard(cache, bio, block); | ||
| 1370 | issue(cache, bio); | ||
| 1371 | } | ||
| 1372 | } else { | ||
| 1373 | inc_hit_counter(cache, bio); | ||
| 1374 | |||
| 1375 | if (bio_data_dir(bio) == WRITE && | ||
| 1376 | writethrough_mode(&cache->features) && | ||
| 1377 | !is_dirty(cache, lookup_result.cblock)) { | ||
| 1378 | pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); | ||
| 1379 | remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); | ||
| 1380 | issue(cache, bio); | ||
| 1381 | } else | ||
| 1382 | issue_cache_bio(cache, bio, pb, block, lookup_result.cblock); | ||
| 1383 | } | ||
| 1170 | 1384 | ||
| 1171 | issue(cache, bio); | ||
| 1172 | break; | 1385 | break; |
| 1173 | 1386 | ||
| 1174 | case POLICY_MISS: | 1387 | case POLICY_MISS: |
| @@ -1227,15 +1440,17 @@ static int need_commit_due_to_time(struct cache *cache) | |||
| 1227 | 1440 | ||
| 1228 | static int commit_if_needed(struct cache *cache) | 1441 | static int commit_if_needed(struct cache *cache) |
| 1229 | { | 1442 | { |
| 1230 | if (dm_cache_changed_this_transaction(cache->cmd) && | 1443 | int r = 0; |
| 1231 | (cache->commit_requested || need_commit_due_to_time(cache))) { | 1444 | |
| 1445 | if ((cache->commit_requested || need_commit_due_to_time(cache)) && | ||
| 1446 | dm_cache_changed_this_transaction(cache->cmd)) { | ||
| 1232 | atomic_inc(&cache->stats.commit_count); | 1447 | atomic_inc(&cache->stats.commit_count); |
| 1233 | cache->last_commit_jiffies = jiffies; | ||
| 1234 | cache->commit_requested = false; | 1448 | cache->commit_requested = false; |
| 1235 | return dm_cache_commit(cache->cmd, false); | 1449 | r = dm_cache_commit(cache->cmd, false); |
| 1450 | cache->last_commit_jiffies = jiffies; | ||
| 1236 | } | 1451 | } |
| 1237 | 1452 | ||
| 1238 | return 0; | 1453 | return r; |
| 1239 | } | 1454 | } |
| 1240 | 1455 | ||
| 1241 | static void process_deferred_bios(struct cache *cache) | 1456 | static void process_deferred_bios(struct cache *cache) |
| @@ -1344,36 +1559,88 @@ static void writeback_some_dirty_blocks(struct cache *cache) | |||
| 1344 | } | 1559 | } |
| 1345 | 1560 | ||
| 1346 | /*---------------------------------------------------------------- | 1561 | /*---------------------------------------------------------------- |
| 1347 | * Main worker loop | 1562 | * Invalidations. |
| 1563 | * Dropping something from the cache *without* writing back. | ||
| 1348 | *--------------------------------------------------------------*/ | 1564 | *--------------------------------------------------------------*/ |
| 1349 | static void start_quiescing(struct cache *cache) | 1565 | |
| 1566 | static void process_invalidation_request(struct cache *cache, struct invalidation_request *req) | ||
| 1350 | { | 1567 | { |
| 1351 | unsigned long flags; | 1568 | int r = 0; |
| 1569 | uint64_t begin = from_cblock(req->cblocks->begin); | ||
| 1570 | uint64_t end = from_cblock(req->cblocks->end); | ||
| 1352 | 1571 | ||
| 1353 | spin_lock_irqsave(&cache->lock, flags); | 1572 | while (begin != end) { |
| 1354 | cache->quiescing = 1; | 1573 | r = policy_remove_cblock(cache->policy, to_cblock(begin)); |
| 1355 | spin_unlock_irqrestore(&cache->lock, flags); | 1574 | if (!r) { |
| 1575 | r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin)); | ||
| 1576 | if (r) | ||
| 1577 | break; | ||
| 1578 | |||
| 1579 | } else if (r == -ENODATA) { | ||
| 1580 | /* harmless, already unmapped */ | ||
| 1581 | r = 0; | ||
| 1582 | |||
| 1583 | } else { | ||
| 1584 | DMERR("policy_remove_cblock failed"); | ||
| 1585 | break; | ||
| 1586 | } | ||
| 1587 | |||
| 1588 | begin++; | ||
| 1589 | } | ||
| 1590 | |||
| 1591 | cache->commit_requested = true; | ||
| 1592 | |||
| 1593 | req->err = r; | ||
| 1594 | atomic_set(&req->complete, 1); | ||
| 1595 | |||
| 1596 | wake_up(&req->result_wait); | ||
| 1356 | } | 1597 | } |
| 1357 | 1598 | ||
| 1358 | static void stop_quiescing(struct cache *cache) | 1599 | static void process_invalidation_requests(struct cache *cache) |
| 1359 | { | 1600 | { |
| 1360 | unsigned long flags; | 1601 | struct list_head list; |
| 1602 | struct invalidation_request *req, *tmp; | ||
| 1361 | 1603 | ||
| 1362 | spin_lock_irqsave(&cache->lock, flags); | 1604 | INIT_LIST_HEAD(&list); |
| 1363 | cache->quiescing = 0; | 1605 | spin_lock(&cache->invalidation_lock); |
| 1364 | spin_unlock_irqrestore(&cache->lock, flags); | 1606 | list_splice_init(&cache->invalidation_requests, &list); |
| 1607 | spin_unlock(&cache->invalidation_lock); | ||
| 1608 | |||
| 1609 | list_for_each_entry_safe (req, tmp, &list, list) | ||
| 1610 | process_invalidation_request(cache, req); | ||
| 1365 | } | 1611 | } |
| 1366 | 1612 | ||
| 1613 | /*---------------------------------------------------------------- | ||
| 1614 | * Main worker loop | ||
| 1615 | *--------------------------------------------------------------*/ | ||
| 1367 | static bool is_quiescing(struct cache *cache) | 1616 | static bool is_quiescing(struct cache *cache) |
| 1368 | { | 1617 | { |
| 1369 | int r; | 1618 | return atomic_read(&cache->quiescing); |
| 1370 | unsigned long flags; | 1619 | } |
| 1371 | 1620 | ||
| 1372 | spin_lock_irqsave(&cache->lock, flags); | 1621 | static void ack_quiescing(struct cache *cache) |
| 1373 | r = cache->quiescing; | 1622 | { |
| 1374 | spin_unlock_irqrestore(&cache->lock, flags); | 1623 | if (is_quiescing(cache)) { |
| 1624 | atomic_inc(&cache->quiescing_ack); | ||
| 1625 | wake_up(&cache->quiescing_wait); | ||
| 1626 | } | ||
| 1627 | } | ||
| 1375 | 1628 | ||
| 1376 | return r; | 1629 | static void wait_for_quiescing_ack(struct cache *cache) |
| 1630 | { | ||
| 1631 | wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack)); | ||
| 1632 | } | ||
| 1633 | |||
| 1634 | static void start_quiescing(struct cache *cache) | ||
| 1635 | { | ||
| 1636 | atomic_inc(&cache->quiescing); | ||
| 1637 | wait_for_quiescing_ack(cache); | ||
| 1638 | } | ||
| 1639 | |||
| 1640 | static void stop_quiescing(struct cache *cache) | ||
| 1641 | { | ||
| 1642 | atomic_set(&cache->quiescing, 0); | ||
| 1643 | atomic_set(&cache->quiescing_ack, 0); | ||
| 1377 | } | 1644 | } |
| 1378 | 1645 | ||
| 1379 | static void wait_for_migrations(struct cache *cache) | 1646 | static void wait_for_migrations(struct cache *cache) |
| @@ -1412,7 +1679,8 @@ static int more_work(struct cache *cache) | |||
| 1412 | !bio_list_empty(&cache->deferred_writethrough_bios) || | 1679 | !bio_list_empty(&cache->deferred_writethrough_bios) || |
| 1413 | !list_empty(&cache->quiesced_migrations) || | 1680 | !list_empty(&cache->quiesced_migrations) || |
| 1414 | !list_empty(&cache->completed_migrations) || | 1681 | !list_empty(&cache->completed_migrations) || |
| 1415 | !list_empty(&cache->need_commit_migrations); | 1682 | !list_empty(&cache->need_commit_migrations) || |
| 1683 | cache->invalidate; | ||
| 1416 | } | 1684 | } |
| 1417 | 1685 | ||
| 1418 | static void do_worker(struct work_struct *ws) | 1686 | static void do_worker(struct work_struct *ws) |
| @@ -1420,16 +1688,16 @@ static void do_worker(struct work_struct *ws) | |||
| 1420 | struct cache *cache = container_of(ws, struct cache, worker); | 1688 | struct cache *cache = container_of(ws, struct cache, worker); |
| 1421 | 1689 | ||
| 1422 | do { | 1690 | do { |
| 1423 | if (!is_quiescing(cache)) | 1691 | if (!is_quiescing(cache)) { |
| 1692 | writeback_some_dirty_blocks(cache); | ||
| 1693 | process_deferred_writethrough_bios(cache); | ||
| 1424 | process_deferred_bios(cache); | 1694 | process_deferred_bios(cache); |
| 1695 | process_invalidation_requests(cache); | ||
| 1696 | } | ||
| 1425 | 1697 | ||
| 1426 | process_migrations(cache, &cache->quiesced_migrations, issue_copy); | 1698 | process_migrations(cache, &cache->quiesced_migrations, issue_copy); |
| 1427 | process_migrations(cache, &cache->completed_migrations, complete_migration); | 1699 | process_migrations(cache, &cache->completed_migrations, complete_migration); |
| 1428 | 1700 | ||
| 1429 | writeback_some_dirty_blocks(cache); | ||
| 1430 | |||
| 1431 | process_deferred_writethrough_bios(cache); | ||
| 1432 | |||
| 1433 | if (commit_if_needed(cache)) { | 1701 | if (commit_if_needed(cache)) { |
| 1434 | process_deferred_flush_bios(cache, false); | 1702 | process_deferred_flush_bios(cache, false); |
| 1435 | 1703 | ||
| @@ -1442,6 +1710,9 @@ static void do_worker(struct work_struct *ws) | |||
| 1442 | process_migrations(cache, &cache->need_commit_migrations, | 1710 | process_migrations(cache, &cache->need_commit_migrations, |
| 1443 | migration_success_post_commit); | 1711 | migration_success_post_commit); |
| 1444 | } | 1712 | } |
| 1713 | |||
| 1714 | ack_quiescing(cache); | ||
| 1715 | |||
| 1445 | } while (more_work(cache)); | 1716 | } while (more_work(cache)); |
| 1446 | } | 1717 | } |
| 1447 | 1718 | ||
| @@ -1715,7 +1986,7 @@ static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, | |||
| 1715 | static void init_features(struct cache_features *cf) | 1986 | static void init_features(struct cache_features *cf) |
| 1716 | { | 1987 | { |
| 1717 | cf->mode = CM_WRITE; | 1988 | cf->mode = CM_WRITE; |
| 1718 | cf->write_through = false; | 1989 | cf->io_mode = CM_IO_WRITEBACK; |
| 1719 | } | 1990 | } |
| 1720 | 1991 | ||
| 1721 | static int parse_features(struct cache_args *ca, struct dm_arg_set *as, | 1992 | static int parse_features(struct cache_args *ca, struct dm_arg_set *as, |
| @@ -1740,10 +2011,13 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as, | |||
| 1740 | arg = dm_shift_arg(as); | 2011 | arg = dm_shift_arg(as); |
| 1741 | 2012 | ||
| 1742 | if (!strcasecmp(arg, "writeback")) | 2013 | if (!strcasecmp(arg, "writeback")) |
| 1743 | cf->write_through = false; | 2014 | cf->io_mode = CM_IO_WRITEBACK; |
| 1744 | 2015 | ||
| 1745 | else if (!strcasecmp(arg, "writethrough")) | 2016 | else if (!strcasecmp(arg, "writethrough")) |
| 1746 | cf->write_through = true; | 2017 | cf->io_mode = CM_IO_WRITETHROUGH; |
| 2018 | |||
| 2019 | else if (!strcasecmp(arg, "passthrough")) | ||
| 2020 | cf->io_mode = CM_IO_PASSTHROUGH; | ||
| 1747 | 2021 | ||
| 1748 | else { | 2022 | else { |
| 1749 | *error = "Unrecognised cache feature requested"; | 2023 | *error = "Unrecognised cache feature requested"; |
| @@ -1872,14 +2146,15 @@ static int set_config_values(struct cache *cache, int argc, const char **argv) | |||
| 1872 | static int create_cache_policy(struct cache *cache, struct cache_args *ca, | 2146 | static int create_cache_policy(struct cache *cache, struct cache_args *ca, |
| 1873 | char **error) | 2147 | char **error) |
| 1874 | { | 2148 | { |
| 1875 | cache->policy = dm_cache_policy_create(ca->policy_name, | 2149 | struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, |
| 1876 | cache->cache_size, | 2150 | cache->cache_size, |
| 1877 | cache->origin_sectors, | 2151 | cache->origin_sectors, |
| 1878 | cache->sectors_per_block); | 2152 | cache->sectors_per_block); |
| 1879 | if (!cache->policy) { | 2153 | if (IS_ERR(p)) { |
| 1880 | *error = "Error creating cache's policy"; | 2154 | *error = "Error creating cache's policy"; |
| 1881 | return -ENOMEM; | 2155 | return PTR_ERR(p); |
| 1882 | } | 2156 | } |
| 2157 | cache->policy = p; | ||
| 1883 | 2158 | ||
| 1884 | return 0; | 2159 | return 0; |
| 1885 | } | 2160 | } |
| @@ -1995,6 +2270,22 @@ static int cache_create(struct cache_args *ca, struct cache **result) | |||
| 1995 | } | 2270 | } |
| 1996 | cache->cmd = cmd; | 2271 | cache->cmd = cmd; |
| 1997 | 2272 | ||
| 2273 | if (passthrough_mode(&cache->features)) { | ||
| 2274 | bool all_clean; | ||
| 2275 | |||
| 2276 | r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); | ||
| 2277 | if (r) { | ||
| 2278 | *error = "dm_cache_metadata_all_clean() failed"; | ||
| 2279 | goto bad; | ||
| 2280 | } | ||
| 2281 | |||
| 2282 | if (!all_clean) { | ||
| 2283 | *error = "Cannot enter passthrough mode unless all blocks are clean"; | ||
| 2284 | r = -EINVAL; | ||
| 2285 | goto bad; | ||
| 2286 | } | ||
| 2287 | } | ||
| 2288 | |||
| 1998 | spin_lock_init(&cache->lock); | 2289 | spin_lock_init(&cache->lock); |
| 1999 | bio_list_init(&cache->deferred_bios); | 2290 | bio_list_init(&cache->deferred_bios); |
| 2000 | bio_list_init(&cache->deferred_flush_bios); | 2291 | bio_list_init(&cache->deferred_flush_bios); |
| @@ -2005,6 +2296,10 @@ static int cache_create(struct cache_args *ca, struct cache **result) | |||
| 2005 | atomic_set(&cache->nr_migrations, 0); | 2296 | atomic_set(&cache->nr_migrations, 0); |
| 2006 | init_waitqueue_head(&cache->migration_wait); | 2297 | init_waitqueue_head(&cache->migration_wait); |
| 2007 | 2298 | ||
| 2299 | init_waitqueue_head(&cache->quiescing_wait); | ||
| 2300 | atomic_set(&cache->quiescing, 0); | ||
| 2301 | atomic_set(&cache->quiescing_ack, 0); | ||
| 2302 | |||
| 2008 | r = -ENOMEM; | 2303 | r = -ENOMEM; |
| 2009 | cache->nr_dirty = 0; | 2304 | cache->nr_dirty = 0; |
| 2010 | cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); | 2305 | cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); |
| @@ -2064,7 +2359,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) | |||
| 2064 | 2359 | ||
| 2065 | cache->need_tick_bio = true; | 2360 | cache->need_tick_bio = true; |
| 2066 | cache->sized = false; | 2361 | cache->sized = false; |
| 2067 | cache->quiescing = false; | 2362 | cache->invalidate = false; |
| 2068 | cache->commit_requested = false; | 2363 | cache->commit_requested = false; |
| 2069 | cache->loaded_mappings = false; | 2364 | cache->loaded_mappings = false; |
| 2070 | cache->loaded_discards = false; | 2365 | cache->loaded_discards = false; |
| @@ -2078,6 +2373,9 @@ static int cache_create(struct cache_args *ca, struct cache **result) | |||
| 2078 | atomic_set(&cache->stats.commit_count, 0); | 2373 | atomic_set(&cache->stats.commit_count, 0); |
| 2079 | atomic_set(&cache->stats.discard_count, 0); | 2374 | atomic_set(&cache->stats.discard_count, 0); |
| 2080 | 2375 | ||
| 2376 | spin_lock_init(&cache->invalidation_lock); | ||
| 2377 | INIT_LIST_HEAD(&cache->invalidation_requests); | ||
| 2378 | |||
| 2081 | *result = cache; | 2379 | *result = cache; |
| 2082 | return 0; | 2380 | return 0; |
| 2083 | 2381 | ||
| @@ -2207,17 +2505,37 @@ static int cache_map(struct dm_target *ti, struct bio *bio) | |||
| 2207 | return DM_MAPIO_SUBMITTED; | 2505 | return DM_MAPIO_SUBMITTED; |
| 2208 | } | 2506 | } |
| 2209 | 2507 | ||
| 2508 | r = DM_MAPIO_REMAPPED; | ||
| 2210 | switch (lookup_result.op) { | 2509 | switch (lookup_result.op) { |
| 2211 | case POLICY_HIT: | 2510 | case POLICY_HIT: |
| 2212 | inc_hit_counter(cache, bio); | 2511 | if (passthrough_mode(&cache->features)) { |
| 2213 | pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); | 2512 | if (bio_data_dir(bio) == WRITE) { |
| 2513 | /* | ||
| 2514 | * We need to invalidate this block, so | ||
| 2515 | * defer for the worker thread. | ||
| 2516 | */ | ||
| 2517 | cell_defer(cache, cell, true); | ||
| 2518 | r = DM_MAPIO_SUBMITTED; | ||
| 2519 | |||
| 2520 | } else { | ||
| 2521 | pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); | ||
| 2522 | inc_miss_counter(cache, bio); | ||
| 2523 | remap_to_origin_clear_discard(cache, bio, block); | ||
| 2524 | |||
| 2525 | cell_defer(cache, cell, false); | ||
| 2526 | } | ||
| 2214 | 2527 | ||
| 2215 | if (is_writethrough_io(cache, bio, lookup_result.cblock)) | 2528 | } else { |
| 2216 | remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); | 2529 | inc_hit_counter(cache, bio); |
| 2217 | else | ||
| 2218 | remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); | ||
| 2219 | 2530 | ||
| 2220 | cell_defer(cache, cell, false); | 2531 | if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && |
| 2532 | !is_dirty(cache, lookup_result.cblock)) | ||
| 2533 | remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); | ||
| 2534 | else | ||
| 2535 | remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); | ||
| 2536 | |||
| 2537 | cell_defer(cache, cell, false); | ||
| 2538 | } | ||
| 2221 | break; | 2539 | break; |
| 2222 | 2540 | ||
| 2223 | case POLICY_MISS: | 2541 | case POLICY_MISS: |
| @@ -2242,10 +2560,10 @@ static int cache_map(struct dm_target *ti, struct bio *bio) | |||
| 2242 | DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__, | 2560 | DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__, |
| 2243 | (unsigned) lookup_result.op); | 2561 | (unsigned) lookup_result.op); |
| 2244 | bio_io_error(bio); | 2562 | bio_io_error(bio); |
| 2245 | return DM_MAPIO_SUBMITTED; | 2563 | r = DM_MAPIO_SUBMITTED; |
| 2246 | } | 2564 | } |
| 2247 | 2565 | ||
| 2248 | return DM_MAPIO_REMAPPED; | 2566 | return r; |
| 2249 | } | 2567 | } |
| 2250 | 2568 | ||
| 2251 | static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) | 2569 | static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) |
| @@ -2406,26 +2724,71 @@ static int load_discard(void *context, sector_t discard_block_size, | |||
| 2406 | return 0; | 2724 | return 0; |
| 2407 | } | 2725 | } |
| 2408 | 2726 | ||
| 2727 | static dm_cblock_t get_cache_dev_size(struct cache *cache) | ||
| 2728 | { | ||
| 2729 | sector_t size = get_dev_size(cache->cache_dev); | ||
| 2730 | (void) sector_div(size, cache->sectors_per_block); | ||
| 2731 | return to_cblock(size); | ||
| 2732 | } | ||
| 2733 | |||
| 2734 | static bool can_resize(struct cache *cache, dm_cblock_t new_size) | ||
| 2735 | { | ||
| 2736 | if (from_cblock(new_size) > from_cblock(cache->cache_size)) | ||
| 2737 | return true; | ||
| 2738 | |||
| 2739 | /* | ||
| 2740 | * We can't drop a dirty block when shrinking the cache. | ||
| 2741 | */ | ||
| 2742 | while (from_cblock(new_size) < from_cblock(cache->cache_size)) { | ||
| 2743 | new_size = to_cblock(from_cblock(new_size) + 1); | ||
| 2744 | if (is_dirty(cache, new_size)) { | ||
| 2745 | DMERR("unable to shrink cache; cache block %llu is dirty", | ||
| 2746 | (unsigned long long) from_cblock(new_size)); | ||
| 2747 | return false; | ||
| 2748 | } | ||
| 2749 | } | ||
| 2750 | |||
| 2751 | return true; | ||
| 2752 | } | ||
| 2753 | |||
| 2754 | static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) | ||
| 2755 | { | ||
| 2756 | int r; | ||
| 2757 | |||
| 2758 | r = dm_cache_resize(cache->cmd, cache->cache_size); | ||
| 2759 | if (r) { | ||
| 2760 | DMERR("could not resize cache metadata"); | ||
| 2761 | return r; | ||
| 2762 | } | ||
| 2763 | |||
| 2764 | cache->cache_size = new_size; | ||
| 2765 | |||
| 2766 | return 0; | ||
| 2767 | } | ||
| 2768 | |||
| 2409 | static int cache_preresume(struct dm_target *ti) | 2769 | static int cache_preresume(struct dm_target *ti) |
| 2410 | { | 2770 | { |
| 2411 | int r = 0; | 2771 | int r = 0; |
| 2412 | struct cache *cache = ti->private; | 2772 | struct cache *cache = ti->private; |
| 2413 | sector_t actual_cache_size = get_dev_size(cache->cache_dev); | 2773 | dm_cblock_t csize = get_cache_dev_size(cache); |
| 2414 | (void) sector_div(actual_cache_size, cache->sectors_per_block); | ||
| 2415 | 2774 | ||
| 2416 | /* | 2775 | /* |
| 2417 | * Check to see if the cache has resized. | 2776 | * Check to see if the cache has resized. |
| 2418 | */ | 2777 | */ |
| 2419 | if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) { | 2778 | if (!cache->sized) { |
| 2420 | cache->cache_size = to_cblock(actual_cache_size); | 2779 | r = resize_cache_dev(cache, csize); |
| 2421 | 2780 | if (r) | |
| 2422 | r = dm_cache_resize(cache->cmd, cache->cache_size); | ||
| 2423 | if (r) { | ||
| 2424 | DMERR("could not resize cache metadata"); | ||
| 2425 | return r; | 2781 | return r; |
| 2426 | } | ||
| 2427 | 2782 | ||
| 2428 | cache->sized = true; | 2783 | cache->sized = true; |
| 2784 | |||
| 2785 | } else if (csize != cache->cache_size) { | ||
| 2786 | if (!can_resize(cache, csize)) | ||
| 2787 | return -EINVAL; | ||
| 2788 | |||
| 2789 | r = resize_cache_dev(cache, csize); | ||
| 2790 | if (r) | ||
| 2791 | return r; | ||
| 2429 | } | 2792 | } |
| 2430 | 2793 | ||
| 2431 | if (!cache->loaded_mappings) { | 2794 | if (!cache->loaded_mappings) { |
| @@ -2518,10 +2881,19 @@ static void cache_status(struct dm_target *ti, status_type_t type, | |||
| 2518 | (unsigned long long) from_cblock(residency), | 2881 | (unsigned long long) from_cblock(residency), |
| 2519 | cache->nr_dirty); | 2882 | cache->nr_dirty); |
| 2520 | 2883 | ||
| 2521 | if (cache->features.write_through) | 2884 | if (writethrough_mode(&cache->features)) |
| 2522 | DMEMIT("1 writethrough "); | 2885 | DMEMIT("1 writethrough "); |
| 2523 | else | 2886 | |
| 2524 | DMEMIT("0 "); | 2887 | else if (passthrough_mode(&cache->features)) |
| 2888 | DMEMIT("1 passthrough "); | ||
| 2889 | |||
| 2890 | else if (writeback_mode(&cache->features)) | ||
| 2891 | DMEMIT("1 writeback "); | ||
| 2892 | |||
| 2893 | else { | ||
| 2894 | DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode); | ||
| 2895 | goto err; | ||
| 2896 | } | ||
| 2525 | 2897 | ||
| 2526 | DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); | 2898 | DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); |
| 2527 | if (sz < maxlen) { | 2899 | if (sz < maxlen) { |
| @@ -2553,7 +2925,128 @@ err: | |||
| 2553 | } | 2925 | } |
| 2554 | 2926 | ||
| 2555 | /* | 2927 | /* |
| 2556 | * Supports <key> <value>. | 2928 | * A cache block range can take two forms: |
| 2929 | * | ||
| 2930 | * i) A single cblock, eg. '3456' | ||
| 2931 | * ii) A begin and end cblock with a dash between, eg. 123-234 | ||
| 2932 | */ | ||
| 2933 | static int parse_cblock_range(struct cache *cache, const char *str, | ||
| 2934 | struct cblock_range *result) | ||
| 2935 | { | ||
| 2936 | char dummy; | ||
| 2937 | uint64_t b, e; | ||
| 2938 | int r; | ||
| 2939 | |||
| 2940 | /* | ||
| 2941 | * Try and parse form (ii) first. | ||
| 2942 | */ | ||
| 2943 | r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); | ||
| 2944 | if (r < 0) | ||
| 2945 | return r; | ||
| 2946 | |||
| 2947 | if (r == 2) { | ||
| 2948 | result->begin = to_cblock(b); | ||
| 2949 | result->end = to_cblock(e); | ||
| 2950 | return 0; | ||
| 2951 | } | ||
| 2952 | |||
| 2953 | /* | ||
| 2954 | * That didn't work, try form (i). | ||
| 2955 | */ | ||
| 2956 | r = sscanf(str, "%llu%c", &b, &dummy); | ||
| 2957 | if (r < 0) | ||
| 2958 | return r; | ||
| 2959 | |||
| 2960 | if (r == 1) { | ||
| 2961 | result->begin = to_cblock(b); | ||
| 2962 | result->end = to_cblock(from_cblock(result->begin) + 1u); | ||
| 2963 | return 0; | ||
| 2964 | } | ||
| 2965 | |||
| 2966 | DMERR("invalid cblock range '%s'", str); | ||
| 2967 | return -EINVAL; | ||
| 2968 | } | ||
| 2969 | |||
| 2970 | static int validate_cblock_range(struct cache *cache, struct cblock_range *range) | ||
| 2971 | { | ||
| 2972 | uint64_t b = from_cblock(range->begin); | ||
| 2973 | uint64_t e = from_cblock(range->end); | ||
| 2974 | uint64_t n = from_cblock(cache->cache_size); | ||
| 2975 | |||
| 2976 | if (b >= n) { | ||
| 2977 | DMERR("begin cblock out of range: %llu >= %llu", b, n); | ||
| 2978 | return -EINVAL; | ||
| 2979 | } | ||
| 2980 | |||
| 2981 | if (e > n) { | ||
| 2982 | DMERR("end cblock out of range: %llu > %llu", e, n); | ||
| 2983 | return -EINVAL; | ||
| 2984 | } | ||
| 2985 | |||
| 2986 | if (b >= e) { | ||
| 2987 | DMERR("invalid cblock range: %llu >= %llu", b, e); | ||
| 2988 | return -EINVAL; | ||
| 2989 | } | ||
| 2990 | |||
| 2991 | return 0; | ||
| 2992 | } | ||
| 2993 | |||
| 2994 | static int request_invalidation(struct cache *cache, struct cblock_range *range) | ||
| 2995 | { | ||
| 2996 | struct invalidation_request req; | ||
| 2997 | |||
| 2998 | INIT_LIST_HEAD(&req.list); | ||
| 2999 | req.cblocks = range; | ||
| 3000 | atomic_set(&req.complete, 0); | ||
| 3001 | req.err = 0; | ||
| 3002 | init_waitqueue_head(&req.result_wait); | ||
| 3003 | |||
| 3004 | spin_lock(&cache->invalidation_lock); | ||
| 3005 | list_add(&req.list, &cache->invalidation_requests); | ||
| 3006 | spin_unlock(&cache->invalidation_lock); | ||
| 3007 | wake_worker(cache); | ||
| 3008 | |||
| 3009 | wait_event(req.result_wait, atomic_read(&req.complete)); | ||
| 3010 | return req.err; | ||
| 3011 | } | ||
| 3012 | |||
| 3013 | static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, | ||
| 3014 | const char **cblock_ranges) | ||
| 3015 | { | ||
| 3016 | int r = 0; | ||
| 3017 | unsigned i; | ||
| 3018 | struct cblock_range range; | ||
| 3019 | |||
| 3020 | if (!passthrough_mode(&cache->features)) { | ||
| 3021 | DMERR("cache has to be in passthrough mode for invalidation"); | ||
| 3022 | return -EPERM; | ||
| 3023 | } | ||
| 3024 | |||
| 3025 | for (i = 0; i < count; i++) { | ||
| 3026 | r = parse_cblock_range(cache, cblock_ranges[i], &range); | ||
| 3027 | if (r) | ||
| 3028 | break; | ||
| 3029 | |||
| 3030 | r = validate_cblock_range(cache, &range); | ||
| 3031 | if (r) | ||
| 3032 | break; | ||
| 3033 | |||
| 3034 | /* | ||
| 3035 | * Pass begin and end cache blocks to the worker and wake it. | ||
| 3036 | */ | ||
| 3037 | r = request_invalidation(cache, &range); | ||
| 3038 | if (r) | ||
| 3039 | break; | ||
| 3040 | } | ||
| 3041 | |||
| 3042 | return r; | ||
| 3043 | } | ||
| 3044 | |||
| 3045 | /* | ||
| 3046 | * Supports | ||
| 3047 | * "<key> <value>" | ||
| 3048 | * and | ||
| 3049 | * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*" | ||
| 2557 | * | 3050 | * |
| 2558 | * The key migration_threshold is supported by the cache target core. | 3051 | * The key migration_threshold is supported by the cache target core. |
| 2559 | */ | 3052 | */ |
| @@ -2561,6 +3054,12 @@ static int cache_message(struct dm_target *ti, unsigned argc, char **argv) | |||
| 2561 | { | 3054 | { |
| 2562 | struct cache *cache = ti->private; | 3055 | struct cache *cache = ti->private; |
| 2563 | 3056 | ||
| 3057 | if (!argc) | ||
| 3058 | return -EINVAL; | ||
| 3059 | |||
| 3060 | if (!strcasecmp(argv[0], "invalidate_cblocks")) | ||
| 3061 | return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); | ||
| 3062 | |||
| 2564 | if (argc != 2) | 3063 | if (argc != 2) |
| 2565 | return -EINVAL; | 3064 | return -EINVAL; |
| 2566 | 3065 | ||
| @@ -2630,7 +3129,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) | |||
| 2630 | 3129 | ||
| 2631 | static struct target_type cache_target = { | 3130 | static struct target_type cache_target = { |
| 2632 | .name = "cache", | 3131 | .name = "cache", |
| 2633 | .version = {1, 1, 1}, | 3132 | .version = {1, 2, 0}, |
| 2634 | .module = THIS_MODULE, | 3133 | .module = THIS_MODULE, |
| 2635 | .ctr = cache_ctr, | 3134 | .ctr = cache_ctr, |
| 2636 | .dtr = cache_dtr, | 3135 | .dtr = cache_dtr, |
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 0fce0bc1a957..50ea7ed24dce 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c | |||
| @@ -2,6 +2,7 @@ | |||
| 2 | * Copyright (C) 2003 Christophe Saout <christophe@saout.de> | 2 | * Copyright (C) 2003 Christophe Saout <christophe@saout.de> |
| 3 | * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> | 3 | * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> |
| 4 | * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. | 4 | * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. |
| 5 | * Copyright (C) 2013 Milan Broz <gmazyland@gmail.com> | ||
| 5 | * | 6 | * |
| 6 | * This file is released under the GPL. | 7 | * This file is released under the GPL. |
| 7 | */ | 8 | */ |
| @@ -98,6 +99,13 @@ struct iv_lmk_private { | |||
| 98 | u8 *seed; | 99 | u8 *seed; |
| 99 | }; | 100 | }; |
| 100 | 101 | ||
| 102 | #define TCW_WHITENING_SIZE 16 | ||
| 103 | struct iv_tcw_private { | ||
| 104 | struct crypto_shash *crc32_tfm; | ||
| 105 | u8 *iv_seed; | ||
| 106 | u8 *whitening; | ||
| 107 | }; | ||
| 108 | |||
| 101 | /* | 109 | /* |
| 102 | * Crypt: maps a linear range of a block device | 110 | * Crypt: maps a linear range of a block device |
| 103 | * and encrypts / decrypts at the same time. | 111 | * and encrypts / decrypts at the same time. |
| @@ -139,6 +147,7 @@ struct crypt_config { | |||
| 139 | struct iv_essiv_private essiv; | 147 | struct iv_essiv_private essiv; |
| 140 | struct iv_benbi_private benbi; | 148 | struct iv_benbi_private benbi; |
| 141 | struct iv_lmk_private lmk; | 149 | struct iv_lmk_private lmk; |
| 150 | struct iv_tcw_private tcw; | ||
| 142 | } iv_gen_private; | 151 | } iv_gen_private; |
| 143 | sector_t iv_offset; | 152 | sector_t iv_offset; |
| 144 | unsigned int iv_size; | 153 | unsigned int iv_size; |
| @@ -171,7 +180,8 @@ struct crypt_config { | |||
| 171 | 180 | ||
| 172 | unsigned long flags; | 181 | unsigned long flags; |
| 173 | unsigned int key_size; | 182 | unsigned int key_size; |
| 174 | unsigned int key_parts; | 183 | unsigned int key_parts; /* independent parts in key buffer */ |
| 184 | unsigned int key_extra_size; /* additional keys length */ | ||
| 175 | u8 key[0]; | 185 | u8 key[0]; |
| 176 | }; | 186 | }; |
| 177 | 187 | ||
| @@ -230,6 +240,16 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) | |||
| 230 | * version 3: the same as version 2 with additional IV seed | 240 | * version 3: the same as version 2 with additional IV seed |
| 231 | * (it uses 65 keys, last key is used as IV seed) | 241 | * (it uses 65 keys, last key is used as IV seed) |
| 232 | * | 242 | * |
| 243 | * tcw: Compatible implementation of the block chaining mode used | ||
| 244 | * by the TrueCrypt device encryption system (prior to version 4.1). | ||
| 245 | * For more info see: http://www.truecrypt.org | ||
| 246 | * It operates on full 512 byte sectors and uses CBC | ||
| 247 | * with an IV derived from initial key and the sector number. | ||
| 248 | * In addition, whitening value is applied on every sector, whitening | ||
| 249 | * is calculated from initial key, sector number and mixed using CRC32. | ||
| 250 | * Note that this encryption scheme is vulnerable to watermarking attacks | ||
| 251 | * and should be used only for accessing old compatible containers. | ||
| 252 | * | ||
| 233 | * plumb: unimplemented, see: | 253 | * plumb: unimplemented, see: |
| 234 | * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 | 254 | * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 |
| 235 | */ | 255 | */ |
| @@ -530,7 +550,7 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, | |||
| 530 | char ctx[crypto_shash_descsize(lmk->hash_tfm)]; | 550 | char ctx[crypto_shash_descsize(lmk->hash_tfm)]; |
| 531 | } sdesc; | 551 | } sdesc; |
| 532 | struct md5_state md5state; | 552 | struct md5_state md5state; |
| 533 | u32 buf[4]; | 553 | __le32 buf[4]; |
| 534 | int i, r; | 554 | int i, r; |
| 535 | 555 | ||
| 536 | sdesc.desc.tfm = lmk->hash_tfm; | 556 | sdesc.desc.tfm = lmk->hash_tfm; |
| @@ -608,6 +628,153 @@ static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, | |||
| 608 | return r; | 628 | return r; |
| 609 | } | 629 | } |
| 610 | 630 | ||
| 631 | static void crypt_iv_tcw_dtr(struct crypt_config *cc) | ||
| 632 | { | ||
| 633 | struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; | ||
| 634 | |||
| 635 | kzfree(tcw->iv_seed); | ||
| 636 | tcw->iv_seed = NULL; | ||
| 637 | kzfree(tcw->whitening); | ||
| 638 | tcw->whitening = NULL; | ||
| 639 | |||
| 640 | if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm)) | ||
| 641 | crypto_free_shash(tcw->crc32_tfm); | ||
| 642 | tcw->crc32_tfm = NULL; | ||
| 643 | } | ||
| 644 | |||
| 645 | static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti, | ||
| 646 | const char *opts) | ||
| 647 | { | ||
| 648 | struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; | ||
| 649 | |||
| 650 | if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) { | ||
| 651 | ti->error = "Wrong key size for TCW"; | ||
| 652 | return -EINVAL; | ||
| 653 | } | ||
| 654 | |||
| 655 | tcw->crc32_tfm = crypto_alloc_shash("crc32", 0, 0); | ||
| 656 | if (IS_ERR(tcw->crc32_tfm)) { | ||
| 657 | ti->error = "Error initializing CRC32 in TCW"; | ||
| 658 | return PTR_ERR(tcw->crc32_tfm); | ||
| 659 | } | ||
| 660 | |||
| 661 | tcw->iv_seed = kzalloc(cc->iv_size, GFP_KERNEL); | ||
| 662 | tcw->whitening = kzalloc(TCW_WHITENING_SIZE, GFP_KERNEL); | ||
| 663 | if (!tcw->iv_seed || !tcw->whitening) { | ||
| 664 | crypt_iv_tcw_dtr(cc); | ||
| 665 | ti->error = "Error allocating seed storage in TCW"; | ||
| 666 | return -ENOMEM; | ||
| 667 | } | ||
| 668 | |||
| 669 | return 0; | ||
| 670 | } | ||
| 671 | |||
| 672 | static int crypt_iv_tcw_init(struct crypt_config *cc) | ||
| 673 | { | ||
| 674 | struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; | ||
| 675 | int key_offset = cc->key_size - cc->iv_size - TCW_WHITENING_SIZE; | ||
| 676 | |||
| 677 | memcpy(tcw->iv_seed, &cc->key[key_offset], cc->iv_size); | ||
| 678 | memcpy(tcw->whitening, &cc->key[key_offset + cc->iv_size], | ||
| 679 | TCW_WHITENING_SIZE); | ||
| 680 | |||
| 681 | return 0; | ||
| 682 | } | ||
| 683 | |||
| 684 | static int crypt_iv_tcw_wipe(struct crypt_config *cc) | ||
| 685 | { | ||
| 686 | struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; | ||
| 687 | |||
| 688 | memset(tcw->iv_seed, 0, cc->iv_size); | ||
| 689 | memset(tcw->whitening, 0, TCW_WHITENING_SIZE); | ||
| 690 | |||
| 691 | return 0; | ||
| 692 | } | ||
| 693 | |||
| 694 | static int crypt_iv_tcw_whitening(struct crypt_config *cc, | ||
| 695 | struct dm_crypt_request *dmreq, | ||
| 696 | u8 *data) | ||
| 697 | { | ||
| 698 | struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; | ||
| 699 | u64 sector = cpu_to_le64((u64)dmreq->iv_sector); | ||
| 700 | u8 buf[TCW_WHITENING_SIZE]; | ||
| 701 | struct { | ||
| 702 | struct shash_desc desc; | ||
| 703 | char ctx[crypto_shash_descsize(tcw->crc32_tfm)]; | ||
| 704 | } sdesc; | ||
| 705 | int i, r; | ||
| 706 | |||
| 707 | /* xor whitening with sector number */ | ||
| 708 | memcpy(buf, tcw->whitening, TCW_WHITENING_SIZE); | ||
| 709 | crypto_xor(buf, (u8 *)&sector, 8); | ||
| 710 | crypto_xor(&buf[8], (u8 *)&sector, 8); | ||
| 711 | |||
| 712 | /* calculate crc32 for every 32bit part and xor it */ | ||
| 713 | sdesc.desc.tfm = tcw->crc32_tfm; | ||
| 714 | sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
| 715 | for (i = 0; i < 4; i++) { | ||
| 716 | r = crypto_shash_init(&sdesc.desc); | ||
| 717 | if (r) | ||
| 718 | goto out; | ||
| 719 | r = crypto_shash_update(&sdesc.desc, &buf[i * 4], 4); | ||
| 720 | if (r) | ||
| 721 | goto out; | ||
| 722 | r = crypto_shash_final(&sdesc.desc, &buf[i * 4]); | ||
| 723 | if (r) | ||
| 724 | goto out; | ||
| 725 | } | ||
| 726 | crypto_xor(&buf[0], &buf[12], 4); | ||
| 727 | crypto_xor(&buf[4], &buf[8], 4); | ||
| 728 | |||
| 729 | /* apply whitening (8 bytes) to whole sector */ | ||
| 730 | for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++) | ||
| 731 | crypto_xor(data + i * 8, buf, 8); | ||
| 732 | out: | ||
| 733 | memset(buf, 0, sizeof(buf)); | ||
| 734 | return r; | ||
| 735 | } | ||
| 736 | |||
| 737 | static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, | ||
| 738 | struct dm_crypt_request *dmreq) | ||
| 739 | { | ||
| 740 | struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; | ||
| 741 | u64 sector = cpu_to_le64((u64)dmreq->iv_sector); | ||
| 742 | u8 *src; | ||
| 743 | int r = 0; | ||
| 744 | |||
| 745 | /* Remove whitening from ciphertext */ | ||
| 746 | if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) { | ||
| 747 | src = kmap_atomic(sg_page(&dmreq->sg_in)); | ||
| 748 | r = crypt_iv_tcw_whitening(cc, dmreq, src + dmreq->sg_in.offset); | ||
| 749 | kunmap_atomic(src); | ||
| 750 | } | ||
| 751 | |||
| 752 | /* Calculate IV */ | ||
| 753 | memcpy(iv, tcw->iv_seed, cc->iv_size); | ||
| 754 | crypto_xor(iv, (u8 *)&sector, 8); | ||
| 755 | if (cc->iv_size > 8) | ||
| 756 | crypto_xor(&iv[8], (u8 *)&sector, cc->iv_size - 8); | ||
| 757 | |||
| 758 | return r; | ||
| 759 | } | ||
| 760 | |||
| 761 | static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, | ||
| 762 | struct dm_crypt_request *dmreq) | ||
| 763 | { | ||
| 764 | u8 *dst; | ||
| 765 | int r; | ||
| 766 | |||
| 767 | if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) | ||
| 768 | return 0; | ||
| 769 | |||
| 770 | /* Apply whitening on ciphertext */ | ||
| 771 | dst = kmap_atomic(sg_page(&dmreq->sg_out)); | ||
| 772 | r = crypt_iv_tcw_whitening(cc, dmreq, dst + dmreq->sg_out.offset); | ||
| 773 | kunmap_atomic(dst); | ||
| 774 | |||
| 775 | return r; | ||
| 776 | } | ||
| 777 | |||
| 611 | static struct crypt_iv_operations crypt_iv_plain_ops = { | 778 | static struct crypt_iv_operations crypt_iv_plain_ops = { |
| 612 | .generator = crypt_iv_plain_gen | 779 | .generator = crypt_iv_plain_gen |
| 613 | }; | 780 | }; |
| @@ -643,6 +810,15 @@ static struct crypt_iv_operations crypt_iv_lmk_ops = { | |||
| 643 | .post = crypt_iv_lmk_post | 810 | .post = crypt_iv_lmk_post |
| 644 | }; | 811 | }; |
| 645 | 812 | ||
| 813 | static struct crypt_iv_operations crypt_iv_tcw_ops = { | ||
| 814 | .ctr = crypt_iv_tcw_ctr, | ||
| 815 | .dtr = crypt_iv_tcw_dtr, | ||
| 816 | .init = crypt_iv_tcw_init, | ||
| 817 | .wipe = crypt_iv_tcw_wipe, | ||
| 818 | .generator = crypt_iv_tcw_gen, | ||
| 819 | .post = crypt_iv_tcw_post | ||
| 820 | }; | ||
| 821 | |||
| 646 | static void crypt_convert_init(struct crypt_config *cc, | 822 | static void crypt_convert_init(struct crypt_config *cc, |
| 647 | struct convert_context *ctx, | 823 | struct convert_context *ctx, |
| 648 | struct bio *bio_out, struct bio *bio_in, | 824 | struct bio *bio_out, struct bio *bio_in, |
| @@ -1274,9 +1450,12 @@ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode) | |||
| 1274 | 1450 | ||
| 1275 | static int crypt_setkey_allcpus(struct crypt_config *cc) | 1451 | static int crypt_setkey_allcpus(struct crypt_config *cc) |
| 1276 | { | 1452 | { |
| 1277 | unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count); | 1453 | unsigned subkey_size; |
| 1278 | int err = 0, i, r; | 1454 | int err = 0, i, r; |
| 1279 | 1455 | ||
| 1456 | /* Ignore extra keys (which are used for IV etc) */ | ||
| 1457 | subkey_size = (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count); | ||
| 1458 | |||
| 1280 | for (i = 0; i < cc->tfms_count; i++) { | 1459 | for (i = 0; i < cc->tfms_count; i++) { |
| 1281 | r = crypto_ablkcipher_setkey(cc->tfms[i], | 1460 | r = crypto_ablkcipher_setkey(cc->tfms[i], |
| 1282 | cc->key + (i * subkey_size), | 1461 | cc->key + (i * subkey_size), |
| @@ -1409,6 +1588,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
| 1409 | return -EINVAL; | 1588 | return -EINVAL; |
| 1410 | } | 1589 | } |
| 1411 | cc->key_parts = cc->tfms_count; | 1590 | cc->key_parts = cc->tfms_count; |
| 1591 | cc->key_extra_size = 0; | ||
| 1412 | 1592 | ||
| 1413 | cc->cipher = kstrdup(cipher, GFP_KERNEL); | 1593 | cc->cipher = kstrdup(cipher, GFP_KERNEL); |
| 1414 | if (!cc->cipher) | 1594 | if (!cc->cipher) |
| @@ -1460,13 +1640,6 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
| 1460 | goto bad; | 1640 | goto bad; |
| 1461 | } | 1641 | } |
| 1462 | 1642 | ||
| 1463 | /* Initialize and set key */ | ||
| 1464 | ret = crypt_set_key(cc, key); | ||
| 1465 | if (ret < 0) { | ||
| 1466 | ti->error = "Error decoding and setting key"; | ||
| 1467 | goto bad; | ||
| 1468 | } | ||
| 1469 | |||
| 1470 | /* Initialize IV */ | 1643 | /* Initialize IV */ |
| 1471 | cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc)); | 1644 | cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc)); |
| 1472 | if (cc->iv_size) | 1645 | if (cc->iv_size) |
| @@ -1493,18 +1666,33 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
| 1493 | cc->iv_gen_ops = &crypt_iv_null_ops; | 1666 | cc->iv_gen_ops = &crypt_iv_null_ops; |
| 1494 | else if (strcmp(ivmode, "lmk") == 0) { | 1667 | else if (strcmp(ivmode, "lmk") == 0) { |
| 1495 | cc->iv_gen_ops = &crypt_iv_lmk_ops; | 1668 | cc->iv_gen_ops = &crypt_iv_lmk_ops; |
| 1496 | /* Version 2 and 3 is recognised according | 1669 | /* |
| 1670 | * Version 2 and 3 is recognised according | ||
| 1497 | * to length of provided multi-key string. | 1671 | * to length of provided multi-key string. |
| 1498 | * If present (version 3), last key is used as IV seed. | 1672 | * If present (version 3), last key is used as IV seed. |
| 1673 | * All keys (including IV seed) are always the same size. | ||
| 1499 | */ | 1674 | */ |
| 1500 | if (cc->key_size % cc->key_parts) | 1675 | if (cc->key_size % cc->key_parts) { |
| 1501 | cc->key_parts++; | 1676 | cc->key_parts++; |
| 1677 | cc->key_extra_size = cc->key_size / cc->key_parts; | ||
| 1678 | } | ||
| 1679 | } else if (strcmp(ivmode, "tcw") == 0) { | ||
| 1680 | cc->iv_gen_ops = &crypt_iv_tcw_ops; | ||
| 1681 | cc->key_parts += 2; /* IV + whitening */ | ||
| 1682 | cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE; | ||
| 1502 | } else { | 1683 | } else { |
| 1503 | ret = -EINVAL; | 1684 | ret = -EINVAL; |
| 1504 | ti->error = "Invalid IV mode"; | 1685 | ti->error = "Invalid IV mode"; |
| 1505 | goto bad; | 1686 | goto bad; |
| 1506 | } | 1687 | } |
| 1507 | 1688 | ||
| 1689 | /* Initialize and set key */ | ||
| 1690 | ret = crypt_set_key(cc, key); | ||
| 1691 | if (ret < 0) { | ||
| 1692 | ti->error = "Error decoding and setting key"; | ||
| 1693 | goto bad; | ||
| 1694 | } | ||
| 1695 | |||
| 1508 | /* Allocate IV */ | 1696 | /* Allocate IV */ |
| 1509 | if (cc->iv_gen_ops && cc->iv_gen_ops->ctr) { | 1697 | if (cc->iv_gen_ops && cc->iv_gen_ops->ctr) { |
| 1510 | ret = cc->iv_gen_ops->ctr(cc, ti, ivopts); | 1698 | ret = cc->iv_gen_ops->ctr(cc, ti, ivopts); |
| @@ -1817,7 +2005,7 @@ static int crypt_iterate_devices(struct dm_target *ti, | |||
| 1817 | 2005 | ||
| 1818 | static struct target_type crypt_target = { | 2006 | static struct target_type crypt_target = { |
| 1819 | .name = "crypt", | 2007 | .name = "crypt", |
| 1820 | .version = {1, 12, 1}, | 2008 | .version = {1, 13, 0}, |
| 1821 | .module = THIS_MODULE, | 2009 | .module = THIS_MODULE, |
| 1822 | .ctr = crypt_ctr, | 2010 | .ctr = crypt_ctr, |
| 1823 | .dtr = crypt_dtr, | 2011 | .dtr = crypt_dtr, |
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index afe08146f73e..51521429fb59 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c | |||
| @@ -57,7 +57,7 @@ struct vers_iter { | |||
| 57 | static struct list_head _name_buckets[NUM_BUCKETS]; | 57 | static struct list_head _name_buckets[NUM_BUCKETS]; |
| 58 | static struct list_head _uuid_buckets[NUM_BUCKETS]; | 58 | static struct list_head _uuid_buckets[NUM_BUCKETS]; |
| 59 | 59 | ||
| 60 | static void dm_hash_remove_all(int keep_open_devices); | 60 | static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred); |
| 61 | 61 | ||
| 62 | /* | 62 | /* |
| 63 | * Guards access to both hash tables. | 63 | * Guards access to both hash tables. |
| @@ -86,7 +86,7 @@ static int dm_hash_init(void) | |||
| 86 | 86 | ||
| 87 | static void dm_hash_exit(void) | 87 | static void dm_hash_exit(void) |
| 88 | { | 88 | { |
| 89 | dm_hash_remove_all(0); | 89 | dm_hash_remove_all(false, false, false); |
| 90 | } | 90 | } |
| 91 | 91 | ||
| 92 | /*----------------------------------------------------------------- | 92 | /*----------------------------------------------------------------- |
| @@ -276,7 +276,7 @@ static struct dm_table *__hash_remove(struct hash_cell *hc) | |||
| 276 | return table; | 276 | return table; |
| 277 | } | 277 | } |
| 278 | 278 | ||
| 279 | static void dm_hash_remove_all(int keep_open_devices) | 279 | static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred) |
| 280 | { | 280 | { |
| 281 | int i, dev_skipped; | 281 | int i, dev_skipped; |
| 282 | struct hash_cell *hc; | 282 | struct hash_cell *hc; |
| @@ -293,7 +293,8 @@ retry: | |||
| 293 | md = hc->md; | 293 | md = hc->md; |
| 294 | dm_get(md); | 294 | dm_get(md); |
| 295 | 295 | ||
| 296 | if (keep_open_devices && dm_lock_for_deletion(md)) { | 296 | if (keep_open_devices && |
| 297 | dm_lock_for_deletion(md, mark_deferred, only_deferred)) { | ||
| 297 | dm_put(md); | 298 | dm_put(md); |
| 298 | dev_skipped++; | 299 | dev_skipped++; |
| 299 | continue; | 300 | continue; |
| @@ -450,6 +451,11 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, | |||
| 450 | return md; | 451 | return md; |
| 451 | } | 452 | } |
| 452 | 453 | ||
| 454 | void dm_deferred_remove(void) | ||
| 455 | { | ||
| 456 | dm_hash_remove_all(true, false, true); | ||
| 457 | } | ||
| 458 | |||
| 453 | /*----------------------------------------------------------------- | 459 | /*----------------------------------------------------------------- |
| 454 | * Implementation of the ioctl commands | 460 | * Implementation of the ioctl commands |
| 455 | *---------------------------------------------------------------*/ | 461 | *---------------------------------------------------------------*/ |
| @@ -461,7 +467,7 @@ typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size); | |||
| 461 | 467 | ||
| 462 | static int remove_all(struct dm_ioctl *param, size_t param_size) | 468 | static int remove_all(struct dm_ioctl *param, size_t param_size) |
| 463 | { | 469 | { |
| 464 | dm_hash_remove_all(1); | 470 | dm_hash_remove_all(true, !!(param->flags & DM_DEFERRED_REMOVE), false); |
| 465 | param->data_size = 0; | 471 | param->data_size = 0; |
| 466 | return 0; | 472 | return 0; |
| 467 | } | 473 | } |
| @@ -683,6 +689,9 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) | |||
| 683 | if (dm_suspended_md(md)) | 689 | if (dm_suspended_md(md)) |
| 684 | param->flags |= DM_SUSPEND_FLAG; | 690 | param->flags |= DM_SUSPEND_FLAG; |
| 685 | 691 | ||
| 692 | if (dm_test_deferred_remove_flag(md)) | ||
| 693 | param->flags |= DM_DEFERRED_REMOVE; | ||
| 694 | |||
| 686 | param->dev = huge_encode_dev(disk_devt(disk)); | 695 | param->dev = huge_encode_dev(disk_devt(disk)); |
| 687 | 696 | ||
| 688 | /* | 697 | /* |
| @@ -832,8 +841,13 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) | |||
| 832 | /* | 841 | /* |
| 833 | * Ensure the device is not open and nothing further can open it. | 842 | * Ensure the device is not open and nothing further can open it. |
| 834 | */ | 843 | */ |
| 835 | r = dm_lock_for_deletion(md); | 844 | r = dm_lock_for_deletion(md, !!(param->flags & DM_DEFERRED_REMOVE), false); |
| 836 | if (r) { | 845 | if (r) { |
| 846 | if (r == -EBUSY && param->flags & DM_DEFERRED_REMOVE) { | ||
| 847 | up_write(&_hash_lock); | ||
| 848 | dm_put(md); | ||
| 849 | return 0; | ||
| 850 | } | ||
| 837 | DMDEBUG_LIMIT("unable to remove open device %s", hc->name); | 851 | DMDEBUG_LIMIT("unable to remove open device %s", hc->name); |
| 838 | up_write(&_hash_lock); | 852 | up_write(&_hash_lock); |
| 839 | dm_put(md); | 853 | dm_put(md); |
| @@ -848,6 +862,8 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) | |||
| 848 | dm_table_destroy(t); | 862 | dm_table_destroy(t); |
| 849 | } | 863 | } |
| 850 | 864 | ||
| 865 | param->flags &= ~DM_DEFERRED_REMOVE; | ||
| 866 | |||
| 851 | if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr)) | 867 | if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr)) |
| 852 | param->flags |= DM_UEVENT_GENERATED_FLAG; | 868 | param->flags |= DM_UEVENT_GENERATED_FLAG; |
| 853 | 869 | ||
| @@ -1469,6 +1485,14 @@ static int message_for_md(struct mapped_device *md, unsigned argc, char **argv, | |||
| 1469 | if (**argv != '@') | 1485 | if (**argv != '@') |
| 1470 | return 2; /* no '@' prefix, deliver to target */ | 1486 | return 2; /* no '@' prefix, deliver to target */ |
| 1471 | 1487 | ||
| 1488 | if (!strcasecmp(argv[0], "@cancel_deferred_remove")) { | ||
| 1489 | if (argc != 1) { | ||
| 1490 | DMERR("Invalid arguments for @cancel_deferred_remove"); | ||
| 1491 | return -EINVAL; | ||
| 1492 | } | ||
| 1493 | return dm_cancel_deferred_remove(md); | ||
| 1494 | } | ||
| 1495 | |||
| 1472 | r = dm_stats_message(md, argc, argv, result, maxlen); | 1496 | r = dm_stats_message(md, argc, argv, result, maxlen); |
| 1473 | if (r < 2) | 1497 | if (r < 2) |
| 1474 | return r; | 1498 | return r; |
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index de570a558764..6eb9dc9ef8f3 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c | |||
| @@ -87,6 +87,7 @@ struct multipath { | |||
| 87 | unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */ | 87 | unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */ |
| 88 | unsigned saved_queue_if_no_path:1; /* Saved state during suspension */ | 88 | unsigned saved_queue_if_no_path:1; /* Saved state during suspension */ |
| 89 | unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */ | 89 | unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */ |
| 90 | unsigned pg_init_disabled:1; /* pg_init is not currently allowed */ | ||
| 90 | 91 | ||
| 91 | unsigned pg_init_retries; /* Number of times to retry pg_init */ | 92 | unsigned pg_init_retries; /* Number of times to retry pg_init */ |
| 92 | unsigned pg_init_count; /* Number of times pg_init called */ | 93 | unsigned pg_init_count; /* Number of times pg_init called */ |
| @@ -390,13 +391,16 @@ static int map_io(struct multipath *m, struct request *clone, | |||
| 390 | if (was_queued) | 391 | if (was_queued) |
| 391 | m->queue_size--; | 392 | m->queue_size--; |
| 392 | 393 | ||
| 393 | if ((pgpath && m->queue_io) || | 394 | if (m->pg_init_required) { |
| 394 | (!pgpath && m->queue_if_no_path)) { | 395 | if (!m->pg_init_in_progress) |
| 396 | queue_work(kmultipathd, &m->process_queued_ios); | ||
| 397 | r = DM_MAPIO_REQUEUE; | ||
| 398 | } else if ((pgpath && m->queue_io) || | ||
| 399 | (!pgpath && m->queue_if_no_path)) { | ||
| 395 | /* Queue for the daemon to resubmit */ | 400 | /* Queue for the daemon to resubmit */ |
| 396 | list_add_tail(&clone->queuelist, &m->queued_ios); | 401 | list_add_tail(&clone->queuelist, &m->queued_ios); |
| 397 | m->queue_size++; | 402 | m->queue_size++; |
| 398 | if ((m->pg_init_required && !m->pg_init_in_progress) || | 403 | if (!m->queue_io) |
| 399 | !m->queue_io) | ||
| 400 | queue_work(kmultipathd, &m->process_queued_ios); | 404 | queue_work(kmultipathd, &m->process_queued_ios); |
| 401 | pgpath = NULL; | 405 | pgpath = NULL; |
| 402 | r = DM_MAPIO_SUBMITTED; | 406 | r = DM_MAPIO_SUBMITTED; |
| @@ -497,7 +501,8 @@ static void process_queued_ios(struct work_struct *work) | |||
| 497 | (!pgpath && !m->queue_if_no_path)) | 501 | (!pgpath && !m->queue_if_no_path)) |
| 498 | must_queue = 0; | 502 | must_queue = 0; |
| 499 | 503 | ||
| 500 | if (m->pg_init_required && !m->pg_init_in_progress && pgpath) | 504 | if (m->pg_init_required && !m->pg_init_in_progress && pgpath && |
| 505 | !m->pg_init_disabled) | ||
| 501 | __pg_init_all_paths(m); | 506 | __pg_init_all_paths(m); |
| 502 | 507 | ||
| 503 | spin_unlock_irqrestore(&m->lock, flags); | 508 | spin_unlock_irqrestore(&m->lock, flags); |
| @@ -942,10 +947,20 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m) | |||
| 942 | 947 | ||
| 943 | static void flush_multipath_work(struct multipath *m) | 948 | static void flush_multipath_work(struct multipath *m) |
| 944 | { | 949 | { |
| 950 | unsigned long flags; | ||
| 951 | |||
| 952 | spin_lock_irqsave(&m->lock, flags); | ||
| 953 | m->pg_init_disabled = 1; | ||
| 954 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 955 | |||
| 945 | flush_workqueue(kmpath_handlerd); | 956 | flush_workqueue(kmpath_handlerd); |
| 946 | multipath_wait_for_pg_init_completion(m); | 957 | multipath_wait_for_pg_init_completion(m); |
| 947 | flush_workqueue(kmultipathd); | 958 | flush_workqueue(kmultipathd); |
| 948 | flush_work(&m->trigger_event); | 959 | flush_work(&m->trigger_event); |
| 960 | |||
| 961 | spin_lock_irqsave(&m->lock, flags); | ||
| 962 | m->pg_init_disabled = 0; | ||
| 963 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 949 | } | 964 | } |
| 950 | 965 | ||
| 951 | static void multipath_dtr(struct dm_target *ti) | 966 | static void multipath_dtr(struct dm_target *ti) |
| @@ -1164,7 +1179,7 @@ static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) | |||
| 1164 | 1179 | ||
| 1165 | spin_lock_irqsave(&m->lock, flags); | 1180 | spin_lock_irqsave(&m->lock, flags); |
| 1166 | 1181 | ||
| 1167 | if (m->pg_init_count <= m->pg_init_retries) | 1182 | if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled) |
| 1168 | m->pg_init_required = 1; | 1183 | m->pg_init_required = 1; |
| 1169 | else | 1184 | else |
| 1170 | limit_reached = 1; | 1185 | limit_reached = 1; |
| @@ -1665,6 +1680,11 @@ static int multipath_busy(struct dm_target *ti) | |||
| 1665 | 1680 | ||
| 1666 | spin_lock_irqsave(&m->lock, flags); | 1681 | spin_lock_irqsave(&m->lock, flags); |
| 1667 | 1682 | ||
| 1683 | /* pg_init in progress, requeue until done */ | ||
| 1684 | if (m->pg_init_in_progress) { | ||
| 1685 | busy = 1; | ||
| 1686 | goto out; | ||
| 1687 | } | ||
| 1668 | /* Guess which priority_group will be used at next mapping time */ | 1688 | /* Guess which priority_group will be used at next mapping time */ |
| 1669 | if (unlikely(!m->current_pgpath && m->next_pg)) | 1689 | if (unlikely(!m->current_pgpath && m->next_pg)) |
| 1670 | pg = m->next_pg; | 1690 | pg = m->next_pg; |
| @@ -1714,7 +1734,7 @@ out: | |||
| 1714 | *---------------------------------------------------------------*/ | 1734 | *---------------------------------------------------------------*/ |
| 1715 | static struct target_type multipath_target = { | 1735 | static struct target_type multipath_target = { |
| 1716 | .name = "multipath", | 1736 | .name = "multipath", |
| 1717 | .version = {1, 5, 1}, | 1737 | .version = {1, 6, 0}, |
| 1718 | .module = THIS_MODULE, | 1738 | .module = THIS_MODULE, |
| 1719 | .ctr = multipath_ctr, | 1739 | .ctr = multipath_ctr, |
| 1720 | .dtr = multipath_dtr, | 1740 | .dtr = multipath_dtr, |
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 8f8783533ac7..465f08ca62b1 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
| @@ -545,14 +545,28 @@ static int adjoin(struct dm_table *table, struct dm_target *ti) | |||
| 545 | 545 | ||
| 546 | /* | 546 | /* |
| 547 | * Used to dynamically allocate the arg array. | 547 | * Used to dynamically allocate the arg array. |
| 548 | * | ||
| 549 | * We do first allocation with GFP_NOIO because dm-mpath and dm-thin must | ||
| 550 | * process messages even if some device is suspended. These messages have a | ||
| 551 | * small fixed number of arguments. | ||
| 552 | * | ||
| 553 | * On the other hand, dm-switch needs to process bulk data using messages and | ||
| 554 | * excessive use of GFP_NOIO could cause trouble. | ||
| 548 | */ | 555 | */ |
| 549 | static char **realloc_argv(unsigned *array_size, char **old_argv) | 556 | static char **realloc_argv(unsigned *array_size, char **old_argv) |
| 550 | { | 557 | { |
| 551 | char **argv; | 558 | char **argv; |
| 552 | unsigned new_size; | 559 | unsigned new_size; |
| 560 | gfp_t gfp; | ||
| 553 | 561 | ||
| 554 | new_size = *array_size ? *array_size * 2 : 64; | 562 | if (*array_size) { |
| 555 | argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL); | 563 | new_size = *array_size * 2; |
| 564 | gfp = GFP_KERNEL; | ||
| 565 | } else { | ||
| 566 | new_size = 8; | ||
| 567 | gfp = GFP_NOIO; | ||
| 568 | } | ||
| 569 | argv = kmalloc(new_size * sizeof(*argv), gfp); | ||
| 556 | if (argv) { | 570 | if (argv) { |
| 557 | memcpy(argv, old_argv, *array_size * sizeof(*argv)); | 571 | memcpy(argv, old_argv, *array_size * sizeof(*argv)); |
| 558 | *array_size = new_size; | 572 | *array_size = new_size; |
| @@ -1548,8 +1562,11 @@ int dm_table_resume_targets(struct dm_table *t) | |||
| 1548 | continue; | 1562 | continue; |
| 1549 | 1563 | ||
| 1550 | r = ti->type->preresume(ti); | 1564 | r = ti->type->preresume(ti); |
| 1551 | if (r) | 1565 | if (r) { |
| 1566 | DMERR("%s: %s: preresume failed, error = %d", | ||
| 1567 | dm_device_name(t->md), ti->type->name, r); | ||
| 1552 | return r; | 1568 | return r; |
| 1569 | } | ||
| 1553 | } | 1570 | } |
| 1554 | 1571 | ||
| 1555 | for (i = 0; i < t->num_targets; i++) { | 1572 | for (i = 0; i < t->num_targets; i++) { |
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b3e26c7d1417..0704c523a76b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
| @@ -49,6 +49,11 @@ static unsigned int _major = 0; | |||
| 49 | static DEFINE_IDR(_minor_idr); | 49 | static DEFINE_IDR(_minor_idr); |
| 50 | 50 | ||
| 51 | static DEFINE_SPINLOCK(_minor_lock); | 51 | static DEFINE_SPINLOCK(_minor_lock); |
| 52 | |||
| 53 | static void do_deferred_remove(struct work_struct *w); | ||
| 54 | |||
| 55 | static DECLARE_WORK(deferred_remove_work, do_deferred_remove); | ||
| 56 | |||
| 52 | /* | 57 | /* |
| 53 | * For bio-based dm. | 58 | * For bio-based dm. |
| 54 | * One of these is allocated per bio. | 59 | * One of these is allocated per bio. |
| @@ -116,6 +121,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); | |||
| 116 | #define DMF_DELETING 4 | 121 | #define DMF_DELETING 4 |
| 117 | #define DMF_NOFLUSH_SUSPENDING 5 | 122 | #define DMF_NOFLUSH_SUSPENDING 5 |
| 118 | #define DMF_MERGE_IS_OPTIONAL 6 | 123 | #define DMF_MERGE_IS_OPTIONAL 6 |
| 124 | #define DMF_DEFERRED_REMOVE 7 | ||
| 119 | 125 | ||
| 120 | /* | 126 | /* |
| 121 | * A dummy definition to make RCU happy. | 127 | * A dummy definition to make RCU happy. |
| @@ -299,6 +305,8 @@ out_free_io_cache: | |||
| 299 | 305 | ||
| 300 | static void local_exit(void) | 306 | static void local_exit(void) |
| 301 | { | 307 | { |
| 308 | flush_scheduled_work(); | ||
| 309 | |||
| 302 | kmem_cache_destroy(_rq_tio_cache); | 310 | kmem_cache_destroy(_rq_tio_cache); |
| 303 | kmem_cache_destroy(_io_cache); | 311 | kmem_cache_destroy(_io_cache); |
| 304 | unregister_blkdev(_major, _name); | 312 | unregister_blkdev(_major, _name); |
| @@ -404,7 +412,10 @@ static void dm_blk_close(struct gendisk *disk, fmode_t mode) | |||
| 404 | 412 | ||
| 405 | spin_lock(&_minor_lock); | 413 | spin_lock(&_minor_lock); |
| 406 | 414 | ||
| 407 | atomic_dec(&md->open_count); | 415 | if (atomic_dec_and_test(&md->open_count) && |
| 416 | (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) | ||
| 417 | schedule_work(&deferred_remove_work); | ||
| 418 | |||
| 408 | dm_put(md); | 419 | dm_put(md); |
| 409 | 420 | ||
| 410 | spin_unlock(&_minor_lock); | 421 | spin_unlock(&_minor_lock); |
| @@ -418,14 +429,18 @@ int dm_open_count(struct mapped_device *md) | |||
| 418 | /* | 429 | /* |
| 419 | * Guarantees nothing is using the device before it's deleted. | 430 | * Guarantees nothing is using the device before it's deleted. |
| 420 | */ | 431 | */ |
| 421 | int dm_lock_for_deletion(struct mapped_device *md) | 432 | int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred) |
| 422 | { | 433 | { |
| 423 | int r = 0; | 434 | int r = 0; |
| 424 | 435 | ||
| 425 | spin_lock(&_minor_lock); | 436 | spin_lock(&_minor_lock); |
| 426 | 437 | ||
| 427 | if (dm_open_count(md)) | 438 | if (dm_open_count(md)) { |
| 428 | r = -EBUSY; | 439 | r = -EBUSY; |
| 440 | if (mark_deferred) | ||
| 441 | set_bit(DMF_DEFERRED_REMOVE, &md->flags); | ||
| 442 | } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) | ||
| 443 | r = -EEXIST; | ||
| 429 | else | 444 | else |
| 430 | set_bit(DMF_DELETING, &md->flags); | 445 | set_bit(DMF_DELETING, &md->flags); |
| 431 | 446 | ||
| @@ -434,6 +449,27 @@ int dm_lock_for_deletion(struct mapped_device *md) | |||
| 434 | return r; | 449 | return r; |
| 435 | } | 450 | } |
| 436 | 451 | ||
| 452 | int dm_cancel_deferred_remove(struct mapped_device *md) | ||
| 453 | { | ||
| 454 | int r = 0; | ||
| 455 | |||
| 456 | spin_lock(&_minor_lock); | ||
| 457 | |||
| 458 | if (test_bit(DMF_DELETING, &md->flags)) | ||
| 459 | r = -EBUSY; | ||
| 460 | else | ||
| 461 | clear_bit(DMF_DEFERRED_REMOVE, &md->flags); | ||
| 462 | |||
| 463 | spin_unlock(&_minor_lock); | ||
| 464 | |||
| 465 | return r; | ||
| 466 | } | ||
| 467 | |||
| 468 | static void do_deferred_remove(struct work_struct *w) | ||
| 469 | { | ||
| 470 | dm_deferred_remove(); | ||
| 471 | } | ||
| 472 | |||
| 437 | sector_t dm_get_size(struct mapped_device *md) | 473 | sector_t dm_get_size(struct mapped_device *md) |
| 438 | { | 474 | { |
| 439 | return get_capacity(md->disk); | 475 | return get_capacity(md->disk); |
| @@ -2894,6 +2930,11 @@ int dm_suspended_md(struct mapped_device *md) | |||
| 2894 | return test_bit(DMF_SUSPENDED, &md->flags); | 2930 | return test_bit(DMF_SUSPENDED, &md->flags); |
| 2895 | } | 2931 | } |
| 2896 | 2932 | ||
| 2933 | int dm_test_deferred_remove_flag(struct mapped_device *md) | ||
| 2934 | { | ||
| 2935 | return test_bit(DMF_DEFERRED_REMOVE, &md->flags); | ||
| 2936 | } | ||
| 2937 | |||
| 2897 | int dm_suspended(struct dm_target *ti) | 2938 | int dm_suspended(struct dm_target *ti) |
| 2898 | { | 2939 | { |
| 2899 | return dm_suspended_md(dm_table_get_md(ti->table)); | 2940 | return dm_suspended_md(dm_table_get_md(ti->table)); |
diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 1d1ad7b7e527..c57ba550f69e 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h | |||
| @@ -129,6 +129,16 @@ int dm_deleting_md(struct mapped_device *md); | |||
| 129 | int dm_suspended_md(struct mapped_device *md); | 129 | int dm_suspended_md(struct mapped_device *md); |
| 130 | 130 | ||
| 131 | /* | 131 | /* |
| 132 | * Test if the device is scheduled for deferred remove. | ||
| 133 | */ | ||
| 134 | int dm_test_deferred_remove_flag(struct mapped_device *md); | ||
| 135 | |||
| 136 | /* | ||
| 137 | * Try to remove devices marked for deferred removal. | ||
| 138 | */ | ||
| 139 | void dm_deferred_remove(void); | ||
| 140 | |||
| 141 | /* | ||
| 132 | * The device-mapper can be driven through one of two interfaces; | 142 | * The device-mapper can be driven through one of two interfaces; |
| 133 | * ioctl or filesystem, depending which patch you have applied. | 143 | * ioctl or filesystem, depending which patch you have applied. |
| 134 | */ | 144 | */ |
| @@ -158,7 +168,8 @@ void dm_stripe_exit(void); | |||
| 158 | void dm_destroy(struct mapped_device *md); | 168 | void dm_destroy(struct mapped_device *md); |
| 159 | void dm_destroy_immediate(struct mapped_device *md); | 169 | void dm_destroy_immediate(struct mapped_device *md); |
| 160 | int dm_open_count(struct mapped_device *md); | 170 | int dm_open_count(struct mapped_device *md); |
| 161 | int dm_lock_for_deletion(struct mapped_device *md); | 171 | int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred); |
| 172 | int dm_cancel_deferred_remove(struct mapped_device *md); | ||
| 162 | int dm_request_based(struct mapped_device *md); | 173 | int dm_request_based(struct mapped_device *md); |
| 163 | sector_t dm_get_size(struct mapped_device *md); | 174 | sector_t dm_get_size(struct mapped_device *md); |
| 164 | struct dm_stats *dm_get_stats(struct mapped_device *md); | 175 | struct dm_stats *dm_get_stats(struct mapped_device *md); |
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index 172147eb1d40..af96e24ec328 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c | |||
| @@ -509,15 +509,18 @@ static int grow_add_tail_block(struct resize *resize) | |||
| 509 | static int grow_needs_more_blocks(struct resize *resize) | 509 | static int grow_needs_more_blocks(struct resize *resize) |
| 510 | { | 510 | { |
| 511 | int r; | 511 | int r; |
| 512 | unsigned old_nr_blocks = resize->old_nr_full_blocks; | ||
| 512 | 513 | ||
| 513 | if (resize->old_nr_entries_in_last_block > 0) { | 514 | if (resize->old_nr_entries_in_last_block > 0) { |
| 515 | old_nr_blocks++; | ||
| 516 | |||
| 514 | r = grow_extend_tail_block(resize, resize->max_entries); | 517 | r = grow_extend_tail_block(resize, resize->max_entries); |
| 515 | if (r) | 518 | if (r) |
| 516 | return r; | 519 | return r; |
| 517 | } | 520 | } |
| 518 | 521 | ||
| 519 | r = insert_full_ablocks(resize->info, resize->size_of_block, | 522 | r = insert_full_ablocks(resize->info, resize->size_of_block, |
| 520 | resize->old_nr_full_blocks, | 523 | old_nr_blocks, |
| 521 | resize->new_nr_full_blocks, | 524 | resize->new_nr_full_blocks, |
| 522 | resize->max_entries, resize->value, | 525 | resize->max_entries, resize->value, |
| 523 | &resize->root); | 526 | &resize->root); |
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index e735a6d5a793..cfbf9617e465 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c | |||
| @@ -140,26 +140,10 @@ static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b) | |||
| 140 | 140 | ||
| 141 | static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b) | 141 | static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b) |
| 142 | { | 142 | { |
| 143 | int r; | ||
| 144 | uint32_t old_count; | ||
| 145 | enum allocation_event ev; | 143 | enum allocation_event ev; |
| 146 | struct sm_disk *smd = container_of(sm, struct sm_disk, sm); | 144 | struct sm_disk *smd = container_of(sm, struct sm_disk, sm); |
| 147 | 145 | ||
| 148 | r = sm_ll_dec(&smd->ll, b, &ev); | 146 | return sm_ll_dec(&smd->ll, b, &ev); |
| 149 | if (!r && (ev == SM_FREE)) { | ||
| 150 | /* | ||
| 151 | * It's only free if it's also free in the last | ||
| 152 | * transaction. | ||
| 153 | */ | ||
| 154 | r = sm_ll_lookup(&smd->old_ll, b, &old_count); | ||
| 155 | if (r) | ||
| 156 | return r; | ||
| 157 | |||
| 158 | if (!old_count) | ||
| 159 | smd->nr_allocated_this_transaction--; | ||
| 160 | } | ||
| 161 | |||
| 162 | return r; | ||
| 163 | } | 147 | } |
| 164 | 148 | ||
| 165 | static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) | 149 | static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) |
