Diffstat (limited to 'drivers/md/dm-clone-target.c')
-rw-r--r--  drivers/md/dm-clone-target.c | 2191
1 file changed, 2191 insertions(+), 0 deletions(-)
diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
new file mode 100644
index 000000000000..cd6f9e9fc98e
--- /dev/null
+++ b/drivers/md/dm-clone-target.c
@@ -0,0 +1,2191 @@
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved. | ||
4 | */ | ||
5 | |||
6 | #include <linux/mm.h> | ||
7 | #include <linux/bio.h> | ||
8 | #include <linux/err.h> | ||
9 | #include <linux/hash.h> | ||
10 | #include <linux/list.h> | ||
11 | #include <linux/log2.h> | ||
12 | #include <linux/init.h> | ||
13 | #include <linux/slab.h> | ||
14 | #include <linux/wait.h> | ||
15 | #include <linux/dm-io.h> | ||
16 | #include <linux/mutex.h> | ||
17 | #include <linux/atomic.h> | ||
18 | #include <linux/bitops.h> | ||
19 | #include <linux/blkdev.h> | ||
20 | #include <linux/kdev_t.h> | ||
21 | #include <linux/kernel.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/jiffies.h> | ||
24 | #include <linux/mempool.h> | ||
25 | #include <linux/spinlock.h> | ||
26 | #include <linux/blk_types.h> | ||
27 | #include <linux/dm-kcopyd.h> | ||
28 | #include <linux/workqueue.h> | ||
29 | #include <linux/backing-dev.h> | ||
30 | #include <linux/device-mapper.h> | ||
31 | |||
32 | #include "dm.h" | ||
33 | #include "dm-clone-metadata.h" | ||
34 | |||
35 | #define DM_MSG_PREFIX "clone" | ||
36 | |||
37 | /* | ||
38 | * Minimum and maximum allowed region sizes | ||
39 | */ | ||
40 | #define MIN_REGION_SIZE (1 << 3) /* 4KB */ | ||
41 | #define MAX_REGION_SIZE (1 << 21) /* 1GB */ | ||
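/*
 * Region sizes are expressed in 512-byte sectors: (1 << 3) sectors = 4KB and
 * (1 << 21) sectors = 1GB.
 */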
42 | |||
43 | #define MIN_HYDRATIONS 256 /* Size of hydration mempool */ | ||
44 | #define DEFAULT_HYDRATION_THRESHOLD 1 /* 1 region */ | ||
45 | #define DEFAULT_HYDRATION_BATCH_SIZE 1 /* Hydrate in batches of 1 region */ | ||
46 | |||
47 | #define COMMIT_PERIOD HZ /* 1 sec */ | ||
48 | |||
49 | /* | ||
50 | * Hydration hash table size: 1 << HASH_TABLE_BITS | ||
51 | */ | ||
52 | #define HASH_TABLE_BITS 15 | ||
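/*
 * With HASH_TABLE_BITS == 15 the table has 1 << 15 = 32768 buckets, each one
 * a struct hash_table_bucket holding an hlist head and a per-bucket spinlock
 * (see hash_table_init() below).
 */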
53 | |||
54 | DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(clone_hydration_throttle, | ||
55 | "A percentage of time allocated for hydrating regions"); | ||
56 | |||
57 | /* Slab cache for struct dm_clone_region_hydration */ | ||
58 | static struct kmem_cache *_hydration_cache; | ||
59 | |||
60 | /* dm-clone metadata modes */ | ||
61 | enum clone_metadata_mode { | ||
62 | CM_WRITE, /* metadata may be changed */ | ||
63 | CM_READ_ONLY, /* metadata may not be changed */ | ||
64 | CM_FAIL, /* all metadata I/O fails */ | ||
65 | }; | ||
66 | |||
67 | struct hash_table_bucket; | ||
68 | |||
69 | struct clone { | ||
70 | struct dm_target *ti; | ||
71 | struct dm_target_callbacks callbacks; | ||
72 | |||
73 | struct dm_dev *metadata_dev; | ||
74 | struct dm_dev *dest_dev; | ||
75 | struct dm_dev *source_dev; | ||
76 | |||
77 | unsigned long nr_regions; | ||
78 | sector_t region_size; | ||
79 | unsigned int region_shift; | ||
80 | |||
81 | /* | ||
82 | * A metadata commit and the actions taken in case it fails should run | ||
83 | * as a single atomic step. | ||
84 | */ | ||
85 | struct mutex commit_lock; | ||
86 | |||
87 | struct dm_clone_metadata *cmd; | ||
88 | |||
89 | /* Region hydration hash table */ | ||
90 | struct hash_table_bucket *ht; | ||
91 | |||
92 | atomic_t ios_in_flight; | ||
93 | |||
94 | wait_queue_head_t hydration_stopped; | ||
95 | |||
96 | mempool_t hydration_pool; | ||
97 | |||
98 | unsigned long last_commit_jiffies; | ||
99 | |||
100 | /* | ||
101 | * We defer incoming WRITE bios for regions that are not hydrated, | ||
102 | * until after these regions have been hydrated. | ||
103 | * | ||
104 | * Also, we defer REQ_FUA and REQ_PREFLUSH bios, until after the | ||
105 | * metadata have been committed. | ||
106 | */ | ||
107 | spinlock_t lock; | ||
108 | struct bio_list deferred_bios; | ||
109 | struct bio_list deferred_discard_bios; | ||
110 | struct bio_list deferred_flush_bios; | ||
111 | struct bio_list deferred_flush_completions; | ||
112 | |||
113 | /* Maximum number of regions being copied during background hydration. */ | ||
114 | unsigned int hydration_threshold; | ||
115 | |||
116 | /* Number of regions to batch together during background hydration. */ | ||
117 | unsigned int hydration_batch_size; | ||
118 | |||
119 | /* Which region to hydrate next */ | ||
120 | unsigned long hydration_offset; | ||
121 | |||
122 | atomic_t hydrations_in_flight; | ||
123 | |||
124 | /* | ||
125 | * Save a copy of the table line rather than reconstructing it for the | ||
126 | * status. | ||
127 | */ | ||
128 | unsigned int nr_ctr_args; | ||
129 | const char **ctr_args; | ||
130 | |||
131 | struct workqueue_struct *wq; | ||
132 | struct work_struct worker; | ||
133 | struct delayed_work waker; | ||
134 | |||
135 | struct dm_kcopyd_client *kcopyd_client; | ||
136 | |||
137 | enum clone_metadata_mode mode; | ||
138 | unsigned long flags; | ||
139 | }; | ||
140 | |||
141 | /* | ||
142 | * dm-clone flags | ||
143 | */ | ||
144 | #define DM_CLONE_DISCARD_PASSDOWN 0 | ||
145 | #define DM_CLONE_HYDRATION_ENABLED 1 | ||
146 | #define DM_CLONE_HYDRATION_SUSPENDED 2 | ||
147 | |||
148 | /*---------------------------------------------------------------------------*/ | ||
149 | |||
150 | /* | ||
151 | * Metadata failure handling. | ||
152 | */ | ||
153 | static enum clone_metadata_mode get_clone_mode(struct clone *clone) | ||
154 | { | ||
155 | return READ_ONCE(clone->mode); | ||
156 | } | ||
157 | |||
158 | static const char *clone_device_name(struct clone *clone) | ||
159 | { | ||
160 | return dm_table_device_name(clone->ti->table); | ||
161 | } | ||
162 | |||
163 | static void __set_clone_mode(struct clone *clone, enum clone_metadata_mode new_mode) | ||
164 | { | ||
165 | const char *descs[] = { | ||
166 | "read-write", | ||
167 | "read-only", | ||
168 | "fail" | ||
169 | }; | ||
170 | |||
171 | enum clone_metadata_mode old_mode = get_clone_mode(clone); | ||
172 | |||
173 | /* Never move out of fail mode */ | ||
174 | if (old_mode == CM_FAIL) | ||
175 | new_mode = CM_FAIL; | ||
176 | |||
177 | switch (new_mode) { | ||
178 | case CM_FAIL: | ||
179 | case CM_READ_ONLY: | ||
180 | dm_clone_metadata_set_read_only(clone->cmd); | ||
181 | break; | ||
182 | |||
183 | case CM_WRITE: | ||
184 | dm_clone_metadata_set_read_write(clone->cmd); | ||
185 | break; | ||
186 | } | ||
187 | |||
188 | WRITE_ONCE(clone->mode, new_mode); | ||
189 | |||
190 | if (new_mode != old_mode) { | ||
191 | dm_table_event(clone->ti->table); | ||
192 | DMINFO("%s: Switching to %s mode", clone_device_name(clone), | ||
193 | descs[(int)new_mode]); | ||
194 | } | ||
195 | } | ||
196 | |||
197 | static void __abort_transaction(struct clone *clone) | ||
198 | { | ||
199 | const char *dev_name = clone_device_name(clone); | ||
200 | |||
201 | if (get_clone_mode(clone) >= CM_READ_ONLY) | ||
202 | return; | ||
203 | |||
204 | DMERR("%s: Aborting current metadata transaction", dev_name); | ||
205 | if (dm_clone_metadata_abort(clone->cmd)) { | ||
206 | DMERR("%s: Failed to abort metadata transaction", dev_name); | ||
207 | __set_clone_mode(clone, CM_FAIL); | ||
208 | } | ||
209 | } | ||
210 | |||
211 | static void __reload_in_core_bitset(struct clone *clone) | ||
212 | { | ||
213 | const char *dev_name = clone_device_name(clone); | ||
214 | |||
215 | if (get_clone_mode(clone) == CM_FAIL) | ||
216 | return; | ||
217 | |||
218 | /* Reload the on-disk bitset */ | ||
219 | DMINFO("%s: Reloading on-disk bitmap", dev_name); | ||
220 | if (dm_clone_reload_in_core_bitset(clone->cmd)) { | ||
221 | DMERR("%s: Failed to reload on-disk bitmap", dev_name); | ||
222 | __set_clone_mode(clone, CM_FAIL); | ||
223 | } | ||
224 | } | ||
225 | |||
226 | static void __metadata_operation_failed(struct clone *clone, const char *op, int r) | ||
227 | { | ||
228 | DMERR("%s: Metadata operation `%s' failed: error = %d", | ||
229 | clone_device_name(clone), op, r); | ||
230 | |||
231 | __abort_transaction(clone); | ||
232 | __set_clone_mode(clone, CM_READ_ONLY); | ||
233 | |||
234 | /* | ||
235 | * dm_clone_reload_in_core_bitset() may run concurrently with either | ||
236 | * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), but | ||
237 | * it's safe as we have already set the metadata to read-only mode. | ||
238 | */ | ||
239 | __reload_in_core_bitset(clone); | ||
240 | } | ||
241 | |||
242 | /*---------------------------------------------------------------------------*/ | ||
243 | |||
244 | /* Wake up anyone waiting for region hydrations to stop */ | ||
245 | static inline void wakeup_hydration_waiters(struct clone *clone) | ||
246 | { | ||
247 | wake_up_all(&clone->hydration_stopped); | ||
248 | } | ||
249 | |||
250 | static inline void wake_worker(struct clone *clone) | ||
251 | { | ||
252 | queue_work(clone->wq, &clone->worker); | ||
253 | } | ||
254 | |||
255 | /*---------------------------------------------------------------------------*/ | ||
256 | |||
257 | /* | ||
258 | * bio helper functions. | ||
259 | */ | ||
260 | static inline void remap_to_source(struct clone *clone, struct bio *bio) | ||
261 | { | ||
262 | bio_set_dev(bio, clone->source_dev->bdev); | ||
263 | } | ||
264 | |||
265 | static inline void remap_to_dest(struct clone *clone, struct bio *bio) | ||
266 | { | ||
267 | bio_set_dev(bio, clone->dest_dev->bdev); | ||
268 | } | ||
269 | |||
270 | static bool bio_triggers_commit(struct clone *clone, struct bio *bio) | ||
271 | { | ||
272 | return op_is_flush(bio->bi_opf) && | ||
273 | dm_clone_changed_this_transaction(clone->cmd); | ||
274 | } | ||
275 | |||
276 | /* Get the address of the region in sectors */ | ||
277 | static inline sector_t region_to_sector(struct clone *clone, unsigned long region_nr) | ||
278 | { | ||
279 | return (region_nr << clone->region_shift); | ||
280 | } | ||
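/*
 * region_shift is log2(region_size), so region/sector conversions are plain
 * shifts. Worked example (hypothetical numbers): with an 8-sector (4KB)
 * region size, region_shift is 3 and region 5 starts at sector 40.
 */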
281 | |||
282 | /* Get the region number of the bio */ | ||
283 | static inline unsigned long bio_to_region(struct clone *clone, struct bio *bio) | ||
284 | { | ||
285 | return (bio->bi_iter.bi_sector >> clone->region_shift); | ||
286 | } | ||
287 | |||
288 | /* Get the region range covered by the bio */ | ||
289 | static void bio_region_range(struct clone *clone, struct bio *bio, | ||
290 | unsigned long *rs, unsigned long *re) | ||
291 | { | ||
292 | *rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size); | ||
293 | *re = bio_end_sector(bio) >> clone->region_shift; | ||
294 | } | ||
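/*
 * Note that *rs is rounded up and *re is rounded down, so [*rs, *re) covers
 * only the regions fully contained in the bio. Worked example (hypothetical
 * numbers): with an 8-sector region size, a bio spanning sectors 4-27 yields
 * rs = 1, re = 3, i.e., only regions 1 and 2.
 */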
295 | |||
296 | /* Check whether a bio overwrites a region */ | ||
297 | static inline bool is_overwrite_bio(struct clone *clone, struct bio *bio) | ||
298 | { | ||
299 | return (bio_data_dir(bio) == WRITE && bio_sectors(bio) == clone->region_size); | ||
300 | } | ||
301 | |||
302 | static void fail_bios(struct bio_list *bios, blk_status_t status) | ||
303 | { | ||
304 | struct bio *bio; | ||
305 | |||
306 | while ((bio = bio_list_pop(bios))) { | ||
307 | bio->bi_status = status; | ||
308 | bio_endio(bio); | ||
309 | } | ||
310 | } | ||
311 | |||
312 | static void submit_bios(struct bio_list *bios) | ||
313 | { | ||
314 | struct bio *bio; | ||
315 | struct blk_plug plug; | ||
316 | |||
317 | blk_start_plug(&plug); | ||
318 | |||
319 | while ((bio = bio_list_pop(bios))) | ||
320 | generic_make_request(bio); | ||
321 | |||
322 | blk_finish_plug(&plug); | ||
323 | } | ||
324 | |||
325 | /* | ||
326 | * Submit bio to the underlying device. | ||
327 | * | ||
328 | * If the bio triggers a commit, delay it, until after the metadata have been | ||
329 | * committed. | ||
330 | * | ||
331 | * NOTE: The bio remapping must be performed by the caller. | ||
332 | */ | ||
333 | static void issue_bio(struct clone *clone, struct bio *bio) | ||
334 | { | ||
335 | unsigned long flags; | ||
336 | |||
337 | if (!bio_triggers_commit(clone, bio)) { | ||
338 | generic_make_request(bio); | ||
339 | return; | ||
340 | } | ||
341 | |||
342 | /* | ||
343 | * If the metadata mode is RO or FAIL we won't be able to commit the | ||
344 | * metadata, so we complete the bio with an error. | ||
345 | */ | ||
346 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { | ||
347 | bio_io_error(bio); | ||
348 | return; | ||
349 | } | ||
350 | |||
351 | /* | ||
352 | * Batch together any bios that trigger commits and then issue a single | ||
353 | * commit for them in process_deferred_flush_bios(). | ||
354 | */ | ||
355 | spin_lock_irqsave(&clone->lock, flags); | ||
356 | bio_list_add(&clone->deferred_flush_bios, bio); | ||
357 | spin_unlock_irqrestore(&clone->lock, flags); | ||
358 | |||
359 | wake_worker(clone); | ||
360 | } | ||
361 | |||
362 | /* | ||
363 | * Remap bio to the destination device and submit it. | ||
364 | * | ||
365 | * If the bio triggers a commit, delay it, until after the metadata have been | ||
366 | * committed. | ||
367 | */ | ||
368 | static void remap_and_issue(struct clone *clone, struct bio *bio) | ||
369 | { | ||
370 | remap_to_dest(clone, bio); | ||
371 | issue_bio(clone, bio); | ||
372 | } | ||
373 | |||
374 | /* | ||
375 | * Issue bios that have been deferred until after their region has finished | ||
376 | * hydrating. | ||
377 | * | ||
378 | * We delegate the bio submission to the worker thread, so this is safe to call | ||
379 | * from interrupt context. | ||
380 | */ | ||
381 | static void issue_deferred_bios(struct clone *clone, struct bio_list *bios) | ||
382 | { | ||
383 | struct bio *bio; | ||
384 | unsigned long flags; | ||
385 | struct bio_list flush_bios = BIO_EMPTY_LIST; | ||
386 | struct bio_list normal_bios = BIO_EMPTY_LIST; | ||
387 | |||
388 | if (bio_list_empty(bios)) | ||
389 | return; | ||
390 | |||
391 | while ((bio = bio_list_pop(bios))) { | ||
392 | if (bio_triggers_commit(clone, bio)) | ||
393 | bio_list_add(&flush_bios, bio); | ||
394 | else | ||
395 | bio_list_add(&normal_bios, bio); | ||
396 | } | ||
397 | |||
398 | spin_lock_irqsave(&clone->lock, flags); | ||
399 | bio_list_merge(&clone->deferred_bios, &normal_bios); | ||
400 | bio_list_merge(&clone->deferred_flush_bios, &flush_bios); | ||
401 | spin_unlock_irqrestore(&clone->lock, flags); | ||
402 | |||
403 | wake_worker(clone); | ||
404 | } | ||
405 | |||
406 | static void complete_overwrite_bio(struct clone *clone, struct bio *bio) | ||
407 | { | ||
408 | unsigned long flags; | ||
409 | |||
410 | /* | ||
411 | * If the bio has the REQ_FUA flag set we must commit the metadata | ||
412 | * before signaling its completion. | ||
413 | * | ||
414 | * complete_overwrite_bio() is only called by hydration_complete(), | ||
415 | * after having successfully updated the metadata. This means we don't | ||
416 | * need to call dm_clone_changed_this_transaction() to check if the | ||
417 | * metadata has changed and thus we can avoid taking the metadata spin | ||
418 | * lock. | ||
419 | */ | ||
420 | if (!(bio->bi_opf & REQ_FUA)) { | ||
421 | bio_endio(bio); | ||
422 | return; | ||
423 | } | ||
424 | |||
425 | /* | ||
426 | * If the metadata mode is RO or FAIL we won't be able to commit the | ||
427 | * metadata, so we complete the bio with an error. | ||
428 | */ | ||
429 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { | ||
430 | bio_io_error(bio); | ||
431 | return; | ||
432 | } | ||
433 | |||
434 | /* | ||
435 | * Batch together any bios that trigger commits and then issue a single | ||
436 | * commit for them in process_deferred_flush_bios(). | ||
437 | */ | ||
438 | spin_lock_irqsave(&clone->lock, flags); | ||
439 | bio_list_add(&clone->deferred_flush_completions, bio); | ||
440 | spin_unlock_irqrestore(&clone->lock, flags); | ||
441 | |||
442 | wake_worker(clone); | ||
443 | } | ||
444 | |||
445 | static void trim_bio(struct bio *bio, sector_t sector, unsigned int len) | ||
446 | { | ||
447 | bio->bi_iter.bi_sector = sector; | ||
448 | bio->bi_iter.bi_size = to_bytes(len); | ||
449 | } | ||
450 | |||
451 | static void complete_discard_bio(struct clone *clone, struct bio *bio, bool success) | ||
452 | { | ||
453 | unsigned long rs, re; | ||
454 | |||
455 | /* | ||
456 | * If the destination device supports discards, remap and trim the | ||
457 | * discard bio and pass it down. Otherwise complete the bio | ||
458 | * immediately. | ||
459 | */ | ||
460 | if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) { | ||
461 | remap_to_dest(clone, bio); | ||
462 | bio_region_range(clone, bio, &rs, &re); | ||
463 | trim_bio(bio, rs << clone->region_shift, | ||
464 | (re - rs) << clone->region_shift); | ||
465 | generic_make_request(bio); | ||
466 | } else | ||
467 | bio_endio(bio); | ||
468 | } | ||
469 | |||
470 | static void process_discard_bio(struct clone *clone, struct bio *bio) | ||
471 | { | ||
472 | unsigned long rs, re, flags; | ||
473 | |||
474 | bio_region_range(clone, bio, &rs, &re); | ||
475 | BUG_ON(re > clone->nr_regions); | ||
476 | |||
477 | if (unlikely(rs == re)) { | ||
478 | bio_endio(bio); | ||
479 | return; | ||
480 | } | ||
481 | |||
482 | /* | ||
483 | * The covered regions are already hydrated so we just need to pass | ||
484 | * down the discard. | ||
485 | */ | ||
486 | if (dm_clone_is_range_hydrated(clone->cmd, rs, re - rs)) { | ||
487 | complete_discard_bio(clone, bio, true); | ||
488 | return; | ||
489 | } | ||
490 | |||
491 | /* | ||
492 | * If the metadata mode is RO or FAIL we won't be able to update the | ||
493 | * metadata for the regions covered by the discard so we just ignore | ||
494 | * it. | ||
495 | */ | ||
496 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { | ||
497 | bio_endio(bio); | ||
498 | return; | ||
499 | } | ||
500 | |||
501 | /* | ||
502 | * Defer discard processing. | ||
503 | */ | ||
504 | spin_lock_irqsave(&clone->lock, flags); | ||
505 | bio_list_add(&clone->deferred_discard_bios, bio); | ||
506 | spin_unlock_irqrestore(&clone->lock, flags); | ||
507 | |||
508 | wake_worker(clone); | ||
509 | } | ||
510 | |||
511 | /*---------------------------------------------------------------------------*/ | ||
512 | |||
513 | /* | ||
514 | * dm-clone region hydrations. | ||
515 | */ | ||
516 | struct dm_clone_region_hydration { | ||
517 | struct clone *clone; | ||
518 | unsigned long region_nr; | ||
519 | |||
520 | struct bio *overwrite_bio; | ||
521 | bio_end_io_t *overwrite_bio_end_io; | ||
522 | |||
523 | struct bio_list deferred_bios; | ||
524 | |||
525 | blk_status_t status; | ||
526 | |||
527 | /* Used by hydration batching */ | ||
528 | struct list_head list; | ||
529 | |||
530 | /* Used by hydration hash table */ | ||
531 | struct hlist_node h; | ||
532 | }; | ||
533 | |||
534 | /* | ||
535 | * Hydration hash table implementation. | ||
536 | * | ||
537 | * Ideally we would like to use list_bl, which uses bit spin locks and employs | ||
538 | * the least significant bit of the list head to lock the corresponding bucket, | ||
539 | * reducing the memory overhead for the locks. But, currently, list_bl and bit | ||
540 | * spin locks don't support IRQ safe versions. Since we have to take the lock | ||
541 | * in both process and interrupt context, we must fall back to using regular | ||
542 | * spin locks; one per hash table bucket. | ||
543 | */ | ||
544 | struct hash_table_bucket { | ||
545 | struct hlist_head head; | ||
546 | |||
547 | /* Spinlock protecting the bucket */ | ||
548 | spinlock_t lock; | ||
549 | }; | ||
550 | |||
551 | #define bucket_lock_irqsave(bucket, flags) \ | ||
552 | spin_lock_irqsave(&(bucket)->lock, flags) | ||
553 | |||
554 | #define bucket_unlock_irqrestore(bucket, flags) \ | ||
555 | spin_unlock_irqrestore(&(bucket)->lock, flags) | ||
556 | |||
557 | static int hash_table_init(struct clone *clone) | ||
558 | { | ||
559 | unsigned int i, sz; | ||
560 | struct hash_table_bucket *bucket; | ||
561 | |||
562 | sz = 1 << HASH_TABLE_BITS; | ||
563 | |||
564 | clone->ht = kvmalloc(sz * sizeof(struct hash_table_bucket), GFP_KERNEL); | ||
565 | if (!clone->ht) | ||
566 | return -ENOMEM; | ||
567 | |||
568 | for (i = 0; i < sz; i++) { | ||
569 | bucket = clone->ht + i; | ||
570 | |||
571 | INIT_HLIST_HEAD(&bucket->head); | ||
572 | spin_lock_init(&bucket->lock); | ||
573 | } | ||
574 | |||
575 | return 0; | ||
576 | } | ||
577 | |||
578 | static void hash_table_exit(struct clone *clone) | ||
579 | { | ||
580 | kvfree(clone->ht); | ||
581 | } | ||
582 | |||
583 | static struct hash_table_bucket *get_hash_table_bucket(struct clone *clone, | ||
584 | unsigned long region_nr) | ||
585 | { | ||
586 | return &clone->ht[hash_long(region_nr, HASH_TABLE_BITS)]; | ||
587 | } | ||
588 | |||
589 | /* | ||
590 | * Search hash table for a hydration with hd->region_nr == region_nr | ||
591 | * | ||
592 | * NOTE: Must be called with the bucket lock held | ||
593 | */ | ||
594 | static struct dm_clone_region_hydration *__hash_find(struct hash_table_bucket *bucket,
595 | 						      unsigned long region_nr)
596 | { | ||
597 | struct dm_clone_region_hydration *hd; | ||
598 | |||
599 | hlist_for_each_entry(hd, &bucket->head, h) { | ||
600 | if (hd->region_nr == region_nr) | ||
601 | return hd; | ||
602 | } | ||
603 | |||
604 | return NULL; | ||
605 | } | ||
606 | |||
607 | /* | ||
608 | * Insert a hydration into the hash table. | ||
609 | * | ||
610 | * NOTE: Must be called with the bucket lock held. | ||
611 | */ | ||
612 | static inline void __insert_region_hydration(struct hash_table_bucket *bucket, | ||
613 | struct dm_clone_region_hydration *hd) | ||
614 | { | ||
615 | hlist_add_head(&hd->h, &bucket->head); | ||
616 | } | ||
617 | |||
618 | /* | ||
619 | * This function inserts a hydration into the hash table, unless someone else | ||
620 | * managed to insert a hydration for the same region first. In the latter case | ||
621 | * it returns the existing hydration descriptor for this region. | ||
622 | * | ||
623 | * NOTE: Must be called with the hydration hash table lock held. | ||
624 | */ | ||
625 | static struct dm_clone_region_hydration * | ||
626 | __find_or_insert_region_hydration(struct hash_table_bucket *bucket, | ||
627 | struct dm_clone_region_hydration *hd) | ||
628 | { | ||
629 | struct dm_clone_region_hydration *hd2; | ||
630 | |||
631 | hd2 = __hash_find(bucket, hd->region_nr); | ||
632 | if (hd2) | ||
633 | return hd2; | ||
634 | |||
635 | __insert_region_hydration(bucket, hd); | ||
636 | |||
637 | return hd; | ||
638 | } | ||
639 | |||
640 | /*---------------------------------------------------------------------------*/ | ||
641 | |||
642 | /* Allocate a hydration */ | ||
643 | static struct dm_clone_region_hydration *alloc_hydration(struct clone *clone) | ||
644 | { | ||
645 | struct dm_clone_region_hydration *hd; | ||
646 | |||
647 | /* | ||
648 | * Allocate a hydration from the hydration mempool. | ||
649 | * This might block but it can't fail. | ||
650 | */ | ||
651 | hd = mempool_alloc(&clone->hydration_pool, GFP_NOIO); | ||
652 | hd->clone = clone; | ||
653 | |||
654 | return hd; | ||
655 | } | ||
656 | |||
657 | static inline void free_hydration(struct dm_clone_region_hydration *hd) | ||
658 | { | ||
659 | mempool_free(hd, &hd->clone->hydration_pool); | ||
660 | } | ||
661 | |||
662 | /* Initialize a hydration */ | ||
663 | static void hydration_init(struct dm_clone_region_hydration *hd, unsigned long region_nr) | ||
664 | { | ||
665 | hd->region_nr = region_nr; | ||
666 | hd->overwrite_bio = NULL; | ||
667 | bio_list_init(&hd->deferred_bios); | ||
668 | hd->status = 0; | ||
669 | |||
670 | INIT_LIST_HEAD(&hd->list); | ||
671 | INIT_HLIST_NODE(&hd->h); | ||
672 | } | ||
673 | |||
674 | /*---------------------------------------------------------------------------*/ | ||
675 | |||
676 | /* | ||
677 | * Update dm-clone's metadata after a region has finished hydrating and remove | ||
678 | * hydration from the hash table. | ||
679 | */ | ||
680 | static int hydration_update_metadata(struct dm_clone_region_hydration *hd) | ||
681 | { | ||
682 | int r = 0; | ||
683 | unsigned long flags; | ||
684 | struct hash_table_bucket *bucket; | ||
685 | struct clone *clone = hd->clone; | ||
686 | |||
687 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) | ||
688 | r = -EPERM; | ||
689 | |||
690 | /* Update the metadata */ | ||
691 | if (likely(!r) && hd->status == BLK_STS_OK) | ||
692 | r = dm_clone_set_region_hydrated(clone->cmd, hd->region_nr); | ||
693 | |||
694 | bucket = get_hash_table_bucket(clone, hd->region_nr); | ||
695 | |||
696 | /* Remove hydration from hash table */ | ||
697 | bucket_lock_irqsave(bucket, flags); | ||
698 | hlist_del(&hd->h); | ||
699 | bucket_unlock_irqrestore(bucket, flags); | ||
700 | |||
701 | return r; | ||
702 | } | ||
703 | |||
704 | /* | ||
705 | * Complete a region's hydration: | ||
706 | * | ||
707 | * 1. Update dm-clone's metadata. | ||
708 | * 2. Remove hydration from hash table. | ||
709 | * 3. Complete overwrite bio. | ||
710 | * 4. Issue deferred bios. | ||
711 | * 5. If this was the last hydration, wake up anyone waiting for | ||
712 | * hydrations to finish. | ||
713 | */ | ||
714 | static void hydration_complete(struct dm_clone_region_hydration *hd) | ||
715 | { | ||
716 | int r; | ||
717 | blk_status_t status; | ||
718 | struct clone *clone = hd->clone; | ||
719 | |||
720 | r = hydration_update_metadata(hd); | ||
721 | |||
722 | if (hd->status == BLK_STS_OK && likely(!r)) { | ||
723 | if (hd->overwrite_bio) | ||
724 | complete_overwrite_bio(clone, hd->overwrite_bio); | ||
725 | |||
726 | issue_deferred_bios(clone, &hd->deferred_bios); | ||
727 | } else { | ||
728 | status = r ? BLK_STS_IOERR : hd->status; | ||
729 | |||
730 | if (hd->overwrite_bio) | ||
731 | bio_list_add(&hd->deferred_bios, hd->overwrite_bio); | ||
732 | |||
733 | fail_bios(&hd->deferred_bios, status); | ||
734 | } | ||
735 | |||
736 | free_hydration(hd); | ||
737 | |||
738 | if (atomic_dec_and_test(&clone->hydrations_in_flight)) | ||
739 | wakeup_hydration_waiters(clone); | ||
740 | } | ||
741 | |||
742 | static void hydration_kcopyd_callback(int read_err, unsigned long write_err, void *context) | ||
743 | { | ||
744 | blk_status_t status; | ||
745 | |||
746 | struct dm_clone_region_hydration *tmp, *hd = context; | ||
747 | struct clone *clone = hd->clone; | ||
748 | |||
749 | LIST_HEAD(batched_hydrations); | ||
750 | |||
751 | if (read_err || write_err) { | ||
752 | DMERR_LIMIT("%s: hydration failed", clone_device_name(clone)); | ||
753 | status = BLK_STS_IOERR; | ||
754 | } else { | ||
755 | status = BLK_STS_OK; | ||
756 | } | ||
757 | list_splice_tail(&hd->list, &batched_hydrations); | ||
758 | |||
759 | hd->status = status; | ||
760 | hydration_complete(hd); | ||
761 | |||
762 | /* Complete batched hydrations */ | ||
763 | list_for_each_entry_safe(hd, tmp, &batched_hydrations, list) { | ||
764 | hd->status = status; | ||
765 | hydration_complete(hd); | ||
766 | } | ||
767 | |||
768 | /* Continue background hydration, if there is no I/O in-flight */ | ||
769 | if (test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) && | ||
770 | !atomic_read(&clone->ios_in_flight)) | ||
771 | wake_worker(clone); | ||
772 | } | ||
773 | |||
774 | static void hydration_copy(struct dm_clone_region_hydration *hd, unsigned int nr_regions) | ||
775 | { | ||
776 | unsigned long region_start, region_end; | ||
777 | sector_t tail_size, region_size, total_size; | ||
778 | struct dm_io_region from, to; | ||
779 | struct clone *clone = hd->clone; | ||
780 | |||
781 | region_size = clone->region_size; | ||
782 | region_start = hd->region_nr; | ||
783 | region_end = region_start + nr_regions - 1; | ||
784 | |||
785 | total_size = (nr_regions - 1) << clone->region_shift; | ||
786 | |||
787 | if (region_end == clone->nr_regions - 1) { | ||
788 | /* | ||
789 | * The last region of the target might be smaller than | ||
790 | * region_size. | ||
791 | */ | ||
792 | tail_size = clone->ti->len & (region_size - 1); | ||
793 | if (!tail_size) | ||
794 | tail_size = region_size; | ||
795 | } else { | ||
796 | tail_size = region_size; | ||
797 | } | ||
798 | |||
799 | total_size += tail_size; | ||
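	/*
	 * Worked example (hypothetical numbers): with an 8-sector region size
	 * and a 20-sector target, the last region is 20 & 7 = 4 sectors long,
	 * so a copy ending at the last region covers
	 * (nr_regions - 1) * 8 + 4 sectors rather than nr_regions * 8.
	 */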
800 | |||
801 | from.bdev = clone->source_dev->bdev; | ||
802 | from.sector = region_to_sector(clone, region_start); | ||
803 | from.count = total_size; | ||
804 | |||
805 | to.bdev = clone->dest_dev->bdev; | ||
806 | to.sector = from.sector; | ||
807 | to.count = from.count; | ||
808 | |||
809 | /* Issue copy */ | ||
810 | atomic_add(nr_regions, &clone->hydrations_in_flight); | ||
811 | dm_kcopyd_copy(clone->kcopyd_client, &from, 1, &to, 0, | ||
812 | hydration_kcopyd_callback, hd); | ||
813 | } | ||
814 | |||
815 | static void overwrite_endio(struct bio *bio) | ||
816 | { | ||
817 | struct dm_clone_region_hydration *hd = bio->bi_private; | ||
818 | |||
819 | bio->bi_end_io = hd->overwrite_bio_end_io; | ||
820 | hd->status = bio->bi_status; | ||
821 | |||
822 | hydration_complete(hd); | ||
823 | } | ||
824 | |||
825 | static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio *bio) | ||
826 | { | ||
827 | /* | ||
828 | * We don't need to save and restore bio->bi_private because device | ||
829 | * mapper core generates a new bio for us to use, with clean | ||
830 | * bi_private. | ||
831 | */ | ||
832 | hd->overwrite_bio = bio; | ||
833 | hd->overwrite_bio_end_io = bio->bi_end_io; | ||
834 | |||
835 | bio->bi_end_io = overwrite_endio; | ||
836 | bio->bi_private = hd; | ||
837 | |||
838 | atomic_inc(&hd->clone->hydrations_in_flight); | ||
839 | generic_make_request(bio); | ||
840 | } | ||
841 | |||
842 | /* | ||
843 | * Hydrate bio's region. | ||
844 | * | ||
845 | * This function starts the hydration of the bio's region and puts the bio in | ||
846 | * the list of deferred bios for this region. If, by the time this function is
847 | * called, the region has already finished hydrating, the bio is submitted to
848 | * the destination device instead.
849 | * | ||
850 | * NOTE: The bio remapping must be performed by the caller. | ||
851 | */ | ||
852 | static void hydrate_bio_region(struct clone *clone, struct bio *bio) | ||
853 | { | ||
854 | unsigned long flags; | ||
855 | unsigned long region_nr; | ||
856 | struct hash_table_bucket *bucket; | ||
857 | struct dm_clone_region_hydration *hd, *hd2; | ||
858 | |||
859 | region_nr = bio_to_region(clone, bio); | ||
860 | bucket = get_hash_table_bucket(clone, region_nr); | ||
861 | |||
862 | bucket_lock_irqsave(bucket, flags); | ||
863 | |||
864 | hd = __hash_find(bucket, region_nr); | ||
865 | if (hd) { | ||
866 | /* Someone else is hydrating the region */ | ||
867 | bio_list_add(&hd->deferred_bios, bio); | ||
868 | bucket_unlock_irqrestore(bucket, flags); | ||
869 | return; | ||
870 | } | ||
871 | |||
872 | if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) { | ||
873 | /* The region has been hydrated */ | ||
874 | bucket_unlock_irqrestore(bucket, flags); | ||
875 | issue_bio(clone, bio); | ||
876 | return; | ||
877 | } | ||
878 | |||
879 | /* | ||
880 | * We must allocate a hydration descriptor and start the hydration of | ||
881 | * the corresponding region. | ||
882 | */ | ||
883 | bucket_unlock_irqrestore(bucket, flags); | ||
884 | |||
885 | hd = alloc_hydration(clone); | ||
886 | hydration_init(hd, region_nr); | ||
887 | |||
888 | bucket_lock_irqsave(bucket, flags); | ||
889 | |||
890 | /* Check if the region has been hydrated in the meantime. */ | ||
891 | if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) { | ||
892 | bucket_unlock_irqrestore(bucket, flags); | ||
893 | free_hydration(hd); | ||
894 | issue_bio(clone, bio); | ||
895 | return; | ||
896 | } | ||
897 | |||
898 | hd2 = __find_or_insert_region_hydration(bucket, hd); | ||
899 | if (hd2 != hd) { | ||
900 | /* Someone else started the region's hydration. */ | ||
901 | bio_list_add(&hd2->deferred_bios, bio); | ||
902 | bucket_unlock_irqrestore(bucket, flags); | ||
903 | free_hydration(hd); | ||
904 | return; | ||
905 | } | ||
906 | |||
907 | /* | ||
908 | * If the metadata mode is RO or FAIL then there is no point starting a | ||
909 | * hydration, since we will not be able to update the metadata when the | ||
910 | * hydration finishes. | ||
911 | */ | ||
912 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { | ||
913 | hlist_del(&hd->h); | ||
914 | bucket_unlock_irqrestore(bucket, flags); | ||
915 | free_hydration(hd); | ||
916 | bio_io_error(bio); | ||
917 | return; | ||
918 | } | ||
919 | |||
920 | /* | ||
921 | * Start region hydration. | ||
922 | * | ||
923 | * If a bio overwrites a region, i.e., its size is equal to the | ||
924 | * region's size, then we don't need to copy the region from the source | ||
925 | * to the destination device. | ||
926 | */ | ||
927 | if (is_overwrite_bio(clone, bio)) { | ||
928 | bucket_unlock_irqrestore(bucket, flags); | ||
929 | hydration_overwrite(hd, bio); | ||
930 | } else { | ||
931 | bio_list_add(&hd->deferred_bios, bio); | ||
932 | bucket_unlock_irqrestore(bucket, flags); | ||
933 | hydration_copy(hd, 1); | ||
934 | } | ||
935 | } | ||
936 | |||
937 | /*---------------------------------------------------------------------------*/ | ||
938 | |||
939 | /* | ||
940 | * Background hydrations. | ||
941 | */ | ||
942 | |||
943 | /* | ||
944 | * Batch region hydrations. | ||
945 | * | ||
946 | * To better utilize device bandwidth we batch together the hydration of | ||
947 | * adjacent regions. This allows us to use small region sizes, e.g., 4KB, which | ||
948 | * is good for small, random write performance (because of the overwriting of | ||
949 | * un-hydrated regions) and at the same time issue big copy requests to kcopyd | ||
950 | * to achieve high hydration bandwidth. | ||
951 | */ | ||
952 | struct batch_info { | ||
953 | struct dm_clone_region_hydration *head; | ||
954 | unsigned int nr_batched_regions; | ||
955 | }; | ||
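/*
 * A batch is extended only while the next region to hydrate immediately
 * follows the current batch, i.e., head->region_nr + nr_batched_regions ==
 * hd->region_nr. Any gap, or reaching hydration_batch_size, makes
 * __batch_hydration() issue the whole batch as a single kcopyd copy via
 * hydration_copy() and start a new batch.
 */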
956 | |||
957 | static void __batch_hydration(struct batch_info *batch, | ||
958 | struct dm_clone_region_hydration *hd) | ||
959 | { | ||
960 | struct clone *clone = hd->clone; | ||
961 | unsigned int max_batch_size = READ_ONCE(clone->hydration_batch_size); | ||
962 | |||
963 | if (batch->head) { | ||
964 | /* Try to extend the current batch */ | ||
965 | if (batch->nr_batched_regions < max_batch_size && | ||
966 | (batch->head->region_nr + batch->nr_batched_regions) == hd->region_nr) { | ||
967 | list_add_tail(&hd->list, &batch->head->list); | ||
968 | batch->nr_batched_regions++; | ||
969 | hd = NULL; | ||
970 | } | ||
971 | |||
972 | /* Check if we should issue the current batch */ | ||
973 | if (batch->nr_batched_regions >= max_batch_size || hd) { | ||
974 | hydration_copy(batch->head, batch->nr_batched_regions); | ||
975 | batch->head = NULL; | ||
976 | batch->nr_batched_regions = 0; | ||
977 | } | ||
978 | } | ||
979 | |||
980 | if (!hd) | ||
981 | return; | ||
982 | |||
983 | /* We treat max batch sizes of zero and one equivalently */ | ||
984 | if (max_batch_size <= 1) { | ||
985 | hydration_copy(hd, 1); | ||
986 | return; | ||
987 | } | ||
988 | |||
989 | /* Start a new batch */ | ||
990 | BUG_ON(!list_empty(&hd->list)); | ||
991 | batch->head = hd; | ||
992 | batch->nr_batched_regions = 1; | ||
993 | } | ||
994 | |||
995 | static unsigned long __start_next_hydration(struct clone *clone, | ||
996 | unsigned long offset, | ||
997 | struct batch_info *batch) | ||
998 | { | ||
999 | unsigned long flags; | ||
1000 | struct hash_table_bucket *bucket; | ||
1001 | struct dm_clone_region_hydration *hd; | ||
1002 | unsigned long nr_regions = clone->nr_regions; | ||
1003 | |||
1004 | hd = alloc_hydration(clone); | ||
1005 | |||
1006 | /* Try to find a region to hydrate. */ | ||
1007 | do { | ||
1008 | offset = dm_clone_find_next_unhydrated_region(clone->cmd, offset); | ||
1009 | if (offset == nr_regions) | ||
1010 | break; | ||
1011 | |||
1012 | bucket = get_hash_table_bucket(clone, offset); | ||
1013 | bucket_lock_irqsave(bucket, flags); | ||
1014 | |||
1015 | if (!dm_clone_is_region_hydrated(clone->cmd, offset) && | ||
1016 | !__hash_find(bucket, offset)) { | ||
1017 | hydration_init(hd, offset); | ||
1018 | __insert_region_hydration(bucket, hd); | ||
1019 | bucket_unlock_irqrestore(bucket, flags); | ||
1020 | |||
1021 | /* Batch hydration */ | ||
1022 | __batch_hydration(batch, hd); | ||
1023 | |||
1024 | return (offset + 1); | ||
1025 | } | ||
1026 | |||
1027 | bucket_unlock_irqrestore(bucket, flags); | ||
1028 | |||
1029 | } while (++offset < nr_regions); | ||
1030 | |||
1031 | if (hd) | ||
1032 | free_hydration(hd); | ||
1033 | |||
1034 | return offset; | ||
1035 | } | ||
1036 | |||
1037 | /* | ||
1038 | * This function searches for regions that still reside in the source device | ||
1039 | * and starts their hydration. | ||
1040 | */ | ||
1041 | static void do_hydration(struct clone *clone) | ||
1042 | { | ||
1043 | unsigned int current_volume; | ||
1044 | unsigned long offset, nr_regions = clone->nr_regions; | ||
1045 | |||
1046 | struct batch_info batch = { | ||
1047 | .head = NULL, | ||
1048 | .nr_batched_regions = 0, | ||
1049 | }; | ||
1050 | |||
1051 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) | ||
1052 | return; | ||
1053 | |||
1054 | if (dm_clone_is_hydration_done(clone->cmd)) | ||
1055 | return; | ||
1056 | |||
1057 | /* | ||
1058 | * Avoid race with device suspension. | ||
1059 | */ | ||
1060 | atomic_inc(&clone->hydrations_in_flight); | ||
1061 | |||
1062 | /* | ||
1063 | * Make sure atomic_inc() is ordered before test_bit(), otherwise we | ||
1064 | * might race with clone_postsuspend() and start a region hydration | ||
1065 | * after the target has been suspended. | ||
1066 | * | ||
1067 | * This is paired with the smp_mb__after_atomic() in | ||
1068 | * clone_postsuspend(). | ||
1069 | */ | ||
1070 | smp_mb__after_atomic(); | ||
1071 | |||
1072 | offset = clone->hydration_offset; | ||
1073 | while (likely(!test_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags)) && | ||
1074 | !atomic_read(&clone->ios_in_flight) && | ||
1075 | test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) && | ||
1076 | offset < nr_regions) { | ||
1077 | current_volume = atomic_read(&clone->hydrations_in_flight); | ||
1078 | current_volume += batch.nr_batched_regions; | ||
1079 | |||
1080 | if (current_volume > READ_ONCE(clone->hydration_threshold)) | ||
1081 | break; | ||
1082 | |||
1083 | offset = __start_next_hydration(clone, offset, &batch); | ||
1084 | } | ||
1085 | |||
1086 | if (batch.head) | ||
1087 | hydration_copy(batch.head, batch.nr_batched_regions); | ||
1088 | |||
1089 | if (offset >= nr_regions) | ||
1090 | offset = 0; | ||
1091 | |||
1092 | clone->hydration_offset = offset; | ||
1093 | |||
1094 | if (atomic_dec_and_test(&clone->hydrations_in_flight)) | ||
1095 | wakeup_hydration_waiters(clone); | ||
1096 | } | ||
1097 | |||
1098 | /*---------------------------------------------------------------------------*/ | ||
1099 | |||
1100 | static bool need_commit_due_to_time(struct clone *clone) | ||
1101 | { | ||
1102 | return !time_in_range(jiffies, clone->last_commit_jiffies, | ||
1103 | clone->last_commit_jiffies + COMMIT_PERIOD); | ||
1104 | } | ||
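/*
 * COMMIT_PERIOD is HZ, so pending metadata changes are committed roughly once
 * per second: do_waker() re-queues itself with the same period and wakes the
 * worker, which calls process_deferred_flush_bios() -> commit_metadata().
 */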
1105 | |||
1106 | /* | ||
1107 | * A non-zero return indicates read-only or fail mode. | ||
1108 | */ | ||
1109 | static int commit_metadata(struct clone *clone) | ||
1110 | { | ||
1111 | int r = 0; | ||
1112 | |||
1113 | mutex_lock(&clone->commit_lock); | ||
1114 | |||
1115 | if (!dm_clone_changed_this_transaction(clone->cmd)) | ||
1116 | goto out; | ||
1117 | |||
1118 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { | ||
1119 | r = -EPERM; | ||
1120 | goto out; | ||
1121 | } | ||
1122 | |||
1123 | r = dm_clone_metadata_commit(clone->cmd); | ||
1124 | |||
1125 | if (unlikely(r)) { | ||
1126 | __metadata_operation_failed(clone, "dm_clone_metadata_commit", r); | ||
1127 | goto out; | ||
1128 | } | ||
1129 | |||
1130 | if (dm_clone_is_hydration_done(clone->cmd)) | ||
1131 | dm_table_event(clone->ti->table); | ||
1132 | out: | ||
1133 | mutex_unlock(&clone->commit_lock); | ||
1134 | |||
1135 | return r; | ||
1136 | } | ||
1137 | |||
1138 | static void process_deferred_discards(struct clone *clone) | ||
1139 | { | ||
1140 | int r = -EPERM; | ||
1141 | struct bio *bio; | ||
1142 | struct blk_plug plug; | ||
1143 | unsigned long rs, re, flags; | ||
1144 | struct bio_list discards = BIO_EMPTY_LIST; | ||
1145 | |||
1146 | spin_lock_irqsave(&clone->lock, flags); | ||
1147 | bio_list_merge(&discards, &clone->deferred_discard_bios); | ||
1148 | bio_list_init(&clone->deferred_discard_bios); | ||
1149 | spin_unlock_irqrestore(&clone->lock, flags); | ||
1150 | |||
1151 | if (bio_list_empty(&discards)) | ||
1152 | return; | ||
1153 | |||
1154 | if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) | ||
1155 | goto out; | ||
1156 | |||
1157 | /* Update the metadata */ | ||
1158 | bio_list_for_each(bio, &discards) { | ||
1159 | bio_region_range(clone, bio, &rs, &re); | ||
1160 | /* | ||
1161 | * A discard request might cover regions that have been already | ||
1162 | * hydrated. There is no need to update the metadata for these | ||
1163 | * regions. | ||
1164 | */ | ||
1165 | r = dm_clone_cond_set_range(clone->cmd, rs, re - rs); | ||
1166 | |||
1167 | if (unlikely(r)) | ||
1168 | break; | ||
1169 | } | ||
1170 | out: | ||
1171 | blk_start_plug(&plug); | ||
1172 | while ((bio = bio_list_pop(&discards))) | ||
1173 | complete_discard_bio(clone, bio, r == 0); | ||
1174 | blk_finish_plug(&plug); | ||
1175 | } | ||
1176 | |||
1177 | static void process_deferred_bios(struct clone *clone) | ||
1178 | { | ||
1179 | unsigned long flags; | ||
1180 | struct bio_list bios = BIO_EMPTY_LIST; | ||
1181 | |||
1182 | spin_lock_irqsave(&clone->lock, flags); | ||
1183 | bio_list_merge(&bios, &clone->deferred_bios); | ||
1184 | bio_list_init(&clone->deferred_bios); | ||
1185 | spin_unlock_irqrestore(&clone->lock, flags); | ||
1186 | |||
1187 | if (bio_list_empty(&bios)) | ||
1188 | return; | ||
1189 | |||
1190 | submit_bios(&bios); | ||
1191 | } | ||
1192 | |||
1193 | static void process_deferred_flush_bios(struct clone *clone) | ||
1194 | { | ||
1195 | struct bio *bio; | ||
1196 | unsigned long flags; | ||
1197 | struct bio_list bios = BIO_EMPTY_LIST; | ||
1198 | struct bio_list bio_completions = BIO_EMPTY_LIST; | ||
1199 | |||
1200 | /* | ||
1201 | * If there are any deferred flush bios, we must commit the metadata | ||
1202 | * before issuing them or signaling their completion. | ||
1203 | */ | ||
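	/*
	 * Note: deferred_flush_completions hold REQ_FUA overwrite bios whose
	 * data has already reached the destination device, so after a
	 * successful commit they only need bio_endio(). deferred_flush_bios,
	 * by contrast, still have to be passed down to the device.
	 */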
1204 | spin_lock_irqsave(&clone->lock, flags); | ||
1205 | bio_list_merge(&bios, &clone->deferred_flush_bios); | ||
1206 | bio_list_init(&clone->deferred_flush_bios); | ||
1207 | |||
1208 | bio_list_merge(&bio_completions, &clone->deferred_flush_completions); | ||
1209 | bio_list_init(&clone->deferred_flush_completions); | ||
1210 | spin_unlock_irqrestore(&clone->lock, flags); | ||
1211 | |||
1212 | if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) && | ||
1213 | !(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone))) | ||
1214 | return; | ||
1215 | |||
1216 | if (commit_metadata(clone)) { | ||
1217 | bio_list_merge(&bios, &bio_completions); | ||
1218 | |||
1219 | while ((bio = bio_list_pop(&bios))) | ||
1220 | bio_io_error(bio); | ||
1221 | |||
1222 | return; | ||
1223 | } | ||
1224 | |||
1225 | clone->last_commit_jiffies = jiffies; | ||
1226 | |||
1227 | while ((bio = bio_list_pop(&bio_completions))) | ||
1228 | bio_endio(bio); | ||
1229 | |||
1230 | while ((bio = bio_list_pop(&bios))) | ||
1231 | generic_make_request(bio); | ||
1232 | } | ||
1233 | |||
1234 | static void do_worker(struct work_struct *work) | ||
1235 | { | ||
1236 | struct clone *clone = container_of(work, typeof(*clone), worker); | ||
1237 | |||
1238 | process_deferred_bios(clone); | ||
1239 | process_deferred_discards(clone); | ||
1240 | |||
1241 | /* | ||
1242 | * process_deferred_flush_bios(): | ||
1243 | * | ||
1244 | * - Commit metadata | ||
1245 | * | ||
1246 | * - Process deferred REQ_FUA completions | ||
1247 | * | ||
1248 | * - Process deferred REQ_PREFLUSH bios | ||
1249 | */ | ||
1250 | process_deferred_flush_bios(clone); | ||
1251 | |||
1252 | /* Background hydration */ | ||
1253 | do_hydration(clone); | ||
1254 | } | ||
1255 | |||
1256 | /* | ||
1257 | * Commit periodically so that not too much unwritten data builds up. | ||
1258 | * | ||
1259 | * Also, restart background hydration, if it has been stopped by in-flight I/O. | ||
1260 | */ | ||
1261 | static void do_waker(struct work_struct *work) | ||
1262 | { | ||
1263 | struct clone *clone = container_of(to_delayed_work(work), struct clone, waker); | ||
1264 | |||
1265 | wake_worker(clone); | ||
1266 | queue_delayed_work(clone->wq, &clone->waker, COMMIT_PERIOD); | ||
1267 | } | ||
1268 | |||
1269 | /*---------------------------------------------------------------------------*/ | ||
1270 | |||
1271 | /* | ||
1272 | * Target methods | ||
1273 | */ | ||
1274 | static int clone_map(struct dm_target *ti, struct bio *bio) | ||
1275 | { | ||
1276 | struct clone *clone = ti->private; | ||
1277 | unsigned long region_nr; | ||
1278 | |||
1279 | atomic_inc(&clone->ios_in_flight); | ||
1280 | |||
1281 | if (unlikely(get_clone_mode(clone) == CM_FAIL)) | ||
1282 | return DM_MAPIO_KILL; | ||
1283 | |||
1284 | /* | ||
1285 | * REQ_PREFLUSH bios carry no data: | ||
1286 | * | ||
1287 | * - Commit metadata, if changed | ||
1288 | * | ||
1289 | * - Pass down to destination device | ||
1290 | */ | ||
1291 | if (bio->bi_opf & REQ_PREFLUSH) { | ||
1292 | remap_and_issue(clone, bio); | ||
1293 | return DM_MAPIO_SUBMITTED; | ||
1294 | } | ||
1295 | |||
1296 | bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); | ||
1297 | |||
1298 | /* | ||
1299 | * dm-clone interprets discards and performs a fast hydration of the | ||
1300 | * discarded regions, i.e., we skip the copy from the source device and | ||
1301 | * just mark the regions as hydrated. | ||
1302 | */ | ||
1303 | if (bio_op(bio) == REQ_OP_DISCARD) { | ||
1304 | process_discard_bio(clone, bio); | ||
1305 | return DM_MAPIO_SUBMITTED; | ||
1306 | } | ||
1307 | |||
1308 | /* | ||
1309 | * If the bio's region is hydrated, redirect it to the destination | ||
1310 | * device. | ||
1311 | * | ||
1312 | * If the region is not hydrated and the bio is a READ, redirect it to | ||
1313 | * the source device. | ||
1314 | * | ||
1315 | * Else, defer WRITE bio until after its region has been hydrated and | ||
1316 | * start the region's hydration immediately. | ||
1317 | */ | ||
1318 | region_nr = bio_to_region(clone, bio); | ||
1319 | if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) { | ||
1320 | remap_and_issue(clone, bio); | ||
1321 | return DM_MAPIO_SUBMITTED; | ||
1322 | } else if (bio_data_dir(bio) == READ) { | ||
1323 | remap_to_source(clone, bio); | ||
1324 | return DM_MAPIO_REMAPPED; | ||
1325 | } | ||
1326 | |||
1327 | remap_to_dest(clone, bio); | ||
1328 | hydrate_bio_region(clone, bio); | ||
1329 | |||
1330 | return DM_MAPIO_SUBMITTED; | ||
1331 | } | ||
1332 | |||
1333 | static int clone_endio(struct dm_target *ti, struct bio *bio, blk_status_t *error) | ||
1334 | { | ||
1335 | struct clone *clone = ti->private; | ||
1336 | |||
1337 | atomic_dec(&clone->ios_in_flight); | ||
1338 | |||
1339 | return DM_ENDIO_DONE; | ||
1340 | } | ||
1341 | |||
1342 | static void emit_flags(struct clone *clone, char *result, unsigned int maxlen, | ||
1343 | ssize_t *sz_ptr) | ||
1344 | { | ||
1345 | ssize_t sz = *sz_ptr; | ||
1346 | unsigned int count; | ||
1347 | |||
1348 | count = !test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags); | ||
1349 | count += !test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); | ||
1350 | |||
1351 | DMEMIT("%u ", count); | ||
1352 | |||
1353 | if (!test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags)) | ||
1354 | DMEMIT("no_hydration "); | ||
1355 | |||
1356 | if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) | ||
1357 | DMEMIT("no_discard_passdown "); | ||
1358 | |||
1359 | *sz_ptr = sz; | ||
1360 | } | ||
1361 | |||
1362 | static void emit_core_args(struct clone *clone, char *result, | ||
1363 | unsigned int maxlen, ssize_t *sz_ptr) | ||
1364 | { | ||
1365 | ssize_t sz = *sz_ptr; | ||
1366 | unsigned int count = 4; | ||
1367 | |||
1368 | DMEMIT("%u hydration_threshold %u hydration_batch_size %u ", count, | ||
1369 | READ_ONCE(clone->hydration_threshold), | ||
1370 | READ_ONCE(clone->hydration_batch_size)); | ||
1371 | |||
1372 | *sz_ptr = sz; | ||
1373 | } | ||
1374 | |||
1375 | /* | ||
1376 | * Status format: | ||
1377 | * | ||
1378 | * <metadata block size> <#used metadata blocks>/<#total metadata blocks> | ||
1379 | * <clone region size> <#hydrated regions>/<#total regions> <#hydrating regions> | ||
1380 | * <#features> <features>* <#core args> <core args>* <clone metadata mode> | ||
1381 | */ | ||
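/*
 * Hypothetical STATUSTYPE_INFO output for a healthy device with the default
 * core arguments and no feature flags (all numbers made up):
 *
 *   4096 24/4096 8 1024/16384 1 0 4 hydration_threshold 1 hydration_batch_size 1 rw
 */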
1382 | static void clone_status(struct dm_target *ti, status_type_t type, | ||
1383 | unsigned int status_flags, char *result, | ||
1384 | unsigned int maxlen) | ||
1385 | { | ||
1386 | int r; | ||
1387 | unsigned int i; | ||
1388 | ssize_t sz = 0; | ||
1389 | dm_block_t nr_free_metadata_blocks = 0; | ||
1390 | dm_block_t nr_metadata_blocks = 0; | ||
1391 | char buf[BDEVNAME_SIZE]; | ||
1392 | struct clone *clone = ti->private; | ||
1393 | |||
1394 | switch (type) { | ||
1395 | case STATUSTYPE_INFO: | ||
1396 | if (get_clone_mode(clone) == CM_FAIL) { | ||
1397 | DMEMIT("Fail"); | ||
1398 | break; | ||
1399 | } | ||
1400 | |||
1401 | /* Commit to ensure statistics aren't out-of-date */ | ||
1402 | if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) | ||
1403 | (void) commit_metadata(clone); | ||
1404 | |||
1405 | r = dm_clone_get_free_metadata_block_count(clone->cmd, &nr_free_metadata_blocks); | ||
1406 | |||
1407 | if (r) { | ||
1408 | DMERR("%s: dm_clone_get_free_metadata_block_count returned %d", | ||
1409 | clone_device_name(clone), r); | ||
1410 | goto error; | ||
1411 | } | ||
1412 | |||
1413 | r = dm_clone_get_metadata_dev_size(clone->cmd, &nr_metadata_blocks); | ||
1414 | |||
1415 | if (r) { | ||
1416 | DMERR("%s: dm_clone_get_metadata_dev_size returned %d", | ||
1417 | clone_device_name(clone), r); | ||
1418 | goto error; | ||
1419 | } | ||
1420 | |||
1421 | DMEMIT("%u %llu/%llu %llu %lu/%lu %u ", | ||
1422 | DM_CLONE_METADATA_BLOCK_SIZE, | ||
1423 | (unsigned long long)(nr_metadata_blocks - nr_free_metadata_blocks), | ||
1424 | (unsigned long long)nr_metadata_blocks, | ||
1425 | (unsigned long long)clone->region_size, | ||
1426 | dm_clone_nr_of_hydrated_regions(clone->cmd), | ||
1427 | clone->nr_regions, | ||
1428 | atomic_read(&clone->hydrations_in_flight)); | ||
1429 | |||
1430 | emit_flags(clone, result, maxlen, &sz); | ||
1431 | emit_core_args(clone, result, maxlen, &sz); | ||
1432 | |||
1433 | switch (get_clone_mode(clone)) { | ||
1434 | case CM_WRITE: | ||
1435 | DMEMIT("rw"); | ||
1436 | break; | ||
1437 | case CM_READ_ONLY: | ||
1438 | DMEMIT("ro"); | ||
1439 | break; | ||
1440 | case CM_FAIL: | ||
1441 | DMEMIT("Fail"); | ||
1442 | } | ||
1443 | |||
1444 | break; | ||
1445 | |||
1446 | case STATUSTYPE_TABLE: | ||
1447 | format_dev_t(buf, clone->metadata_dev->bdev->bd_dev); | ||
1448 | DMEMIT("%s ", buf); | ||
1449 | |||
1450 | format_dev_t(buf, clone->dest_dev->bdev->bd_dev); | ||
1451 | DMEMIT("%s ", buf); | ||
1452 | |||
1453 | format_dev_t(buf, clone->source_dev->bdev->bd_dev); | ||
1454 | DMEMIT("%s", buf); | ||
1455 | |||
1456 | for (i = 0; i < clone->nr_ctr_args; i++) | ||
1457 | DMEMIT(" %s", clone->ctr_args[i]); | ||
1458 | } | ||
1459 | |||
1460 | return; | ||
1461 | |||
1462 | error: | ||
1463 | DMEMIT("Error"); | ||
1464 | } | ||
1465 | |||
1466 | static int clone_is_congested(struct dm_target_callbacks *cb, int bdi_bits) | ||
1467 | { | ||
1468 | struct request_queue *dest_q, *source_q; | ||
1469 | struct clone *clone = container_of(cb, struct clone, callbacks); | ||
1470 | |||
1471 | source_q = bdev_get_queue(clone->source_dev->bdev); | ||
1472 | dest_q = bdev_get_queue(clone->dest_dev->bdev); | ||
1473 | |||
1474 | return (bdi_congested(dest_q->backing_dev_info, bdi_bits) | | ||
1475 | bdi_congested(source_q->backing_dev_info, bdi_bits)); | ||
1476 | } | ||
1477 | |||
1478 | static sector_t get_dev_size(struct dm_dev *dev) | ||
1479 | { | ||
1480 | return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; | ||
1481 | } | ||
1482 | |||
1483 | /*---------------------------------------------------------------------------*/ | ||
1484 | |||
1485 | /* | ||
1486 | * Construct a clone device mapping: | ||
1487 | * | ||
1488 | * clone <metadata dev> <destination dev> <source dev> <region size> | ||
1489 | * [<#feature args> [<feature arg>]* [<#core args> [key value]*]] | ||
1490 | * | ||
1491 | * metadata dev: Fast device holding the persistent metadata | ||
1492 | * destination dev: The destination device, which will become a clone of the | ||
1493 | * source device | ||
1494 | * source dev: The read-only source device that gets cloned | ||
1495 | * region size: dm-clone unit size in sectors | ||
1496 | * | ||
1497 | * #feature args: Number of feature arguments passed | ||
1498 | * feature args: E.g. no_hydration, no_discard_passdown | ||
1499 | * | ||
1500 | * #core arguments: An even number of core arguments | ||
1501 | * core arguments: Key/value pairs for tuning the core | ||
1502 | * E.g. 'hydration_threshold 256' | ||
1503 | */ | ||
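/*
 * Hypothetical example table line (device names and sizes made up):
 *
 *   dmsetup create cloned --table "0 1048576 clone /dev/sdb /dev/sdc /dev/sdd 8 1 no_hydration"
 */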
1504 | static int parse_feature_args(struct dm_arg_set *as, struct clone *clone) | ||
1505 | { | ||
1506 | int r; | ||
1507 | unsigned int argc; | ||
1508 | const char *arg_name; | ||
1509 | struct dm_target *ti = clone->ti; | ||
1510 | |||
1511 | const struct dm_arg args = { | ||
1512 | .min = 0, | ||
1513 | .max = 2, | ||
1514 | .error = "Invalid number of feature arguments" | ||
1515 | }; | ||
1516 | |||
1517 | /* No feature arguments supplied */ | ||
1518 | if (!as->argc) | ||
1519 | return 0; | ||
1520 | |||
1521 | r = dm_read_arg_group(&args, as, &argc, &ti->error); | ||
1522 | if (r) | ||
1523 | return r; | ||
1524 | |||
1525 | while (argc) { | ||
1526 | arg_name = dm_shift_arg(as); | ||
1527 | argc--; | ||
1528 | |||
1529 | if (!strcasecmp(arg_name, "no_hydration")) { | ||
1530 | __clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags); | ||
1531 | } else if (!strcasecmp(arg_name, "no_discard_passdown")) { | ||
1532 | __clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); | ||
1533 | } else { | ||
1534 | ti->error = "Invalid feature argument"; | ||
1535 | return -EINVAL; | ||
1536 | } | ||
1537 | } | ||
1538 | |||
1539 | return 0; | ||
1540 | } | ||
1541 | |||
1542 | static int parse_core_args(struct dm_arg_set *as, struct clone *clone) | ||
1543 | { | ||
1544 | int r; | ||
1545 | unsigned int argc; | ||
1546 | unsigned int value; | ||
1547 | const char *arg_name; | ||
1548 | struct dm_target *ti = clone->ti; | ||
1549 | |||
1550 | const struct dm_arg args = { | ||
1551 | .min = 0, | ||
1552 | .max = 4, | ||
1553 | .error = "Invalid number of core arguments" | ||
1554 | }; | ||
1555 | |||
1556 | /* Initialize core arguments */ | ||
1557 | clone->hydration_batch_size = DEFAULT_HYDRATION_BATCH_SIZE; | ||
1558 | clone->hydration_threshold = DEFAULT_HYDRATION_THRESHOLD; | ||
1559 | |||
1560 | /* No core arguments supplied */ | ||
1561 | if (!as->argc) | ||
1562 | return 0; | ||
1563 | |||
1564 | r = dm_read_arg_group(&args, as, &argc, &ti->error); | ||
1565 | if (r) | ||
1566 | return r; | ||
1567 | |||
1568 | if (argc & 1) { | ||
1569 | ti->error = "Number of core arguments must be even"; | ||
1570 | return -EINVAL; | ||
1571 | } | ||
1572 | |||
1573 | while (argc) { | ||
1574 | arg_name = dm_shift_arg(as); | ||
1575 | argc -= 2; | ||
1576 | |||
1577 | if (!strcasecmp(arg_name, "hydration_threshold")) { | ||
1578 | if (kstrtouint(dm_shift_arg(as), 10, &value)) { | ||
1579 | ti->error = "Invalid value for argument `hydration_threshold'"; | ||
1580 | return -EINVAL; | ||
1581 | } | ||
1582 | clone->hydration_threshold = value; | ||
1583 | } else if (!strcasecmp(arg_name, "hydration_batch_size")) { | ||
1584 | if (kstrtouint(dm_shift_arg(as), 10, &value)) { | ||
1585 | ti->error = "Invalid value for argument `hydration_batch_size'"; | ||
1586 | return -EINVAL; | ||
1587 | } | ||
1588 | clone->hydration_batch_size = value; | ||
1589 | } else { | ||
1590 | ti->error = "Invalid core argument"; | ||
1591 | return -EINVAL; | ||
1592 | } | ||
1593 | } | ||
1594 | |||
1595 | return 0; | ||
1596 | } | ||
1597 | |||
1598 | static int parse_region_size(struct clone *clone, struct dm_arg_set *as, char **error) | ||
1599 | { | ||
1600 | int r; | ||
1601 | unsigned int region_size; | ||
1602 | struct dm_arg arg; | ||
1603 | |||
1604 | arg.min = MIN_REGION_SIZE; | ||
1605 | arg.max = MAX_REGION_SIZE; | ||
1606 | arg.error = "Invalid region size"; | ||
1607 | |||
1608 | r = dm_read_arg(&arg, as, ®ion_size, error); | ||
1609 | if (r) | ||
1610 | return r; | ||
1611 | |||
1612 | /* Check region size is a power of 2 */ | ||
1613 | if (!is_power_of_2(region_size)) { | ||
1614 | *error = "Region size is not a power of 2"; | ||
1615 | return -EINVAL; | ||
1616 | } | ||
1617 | |||
1618 | /* Validate the region size against the device logical block size */ | ||
1619 | if (region_size % (bdev_logical_block_size(clone->source_dev->bdev) >> 9) || | ||
1620 | region_size % (bdev_logical_block_size(clone->dest_dev->bdev) >> 9)) { | ||
1621 | *error = "Region size is not a multiple of device logical block size"; | ||
1622 | return -EINVAL; | ||
1623 | } | ||
1624 | |||
1625 | clone->region_size = region_size; | ||
1626 | |||
1627 | return 0; | ||
1628 | } | ||
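/*
 * For example, a region size of 128 sectors (64 KiB) passes both checks on
 * devices with 512-byte or 4 KiB logical blocks: it is a power of 2 and a
 * multiple of 8 sectors (4 KiB / 512 bytes).
 */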
1629 | |||
1630 | static int validate_nr_regions(unsigned long n, char **error) | ||
1631 | { | ||
1632 | /* | ||
1633 | * dm_bitset restricts us to 2^32 regions. test_bit & co. restrict us | ||
1634 | * further to 2^31 regions. | ||
1635 | */ | ||
1636 | if (n > (1UL << 31)) { | ||
1637 | *error = "Too many regions. Consider increasing the region size"; | ||
1638 | return -EINVAL; | ||
1639 | } | ||
1640 | |||
1641 | return 0; | ||
1642 | } | ||
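/*
 * For example, with 4 KiB regions (8 sectors) a 1 TiB target maps to 2^28
 * regions; the 2^31 limit is not hit until the device reaches 8 TiB, at
 * which point a larger region size is required.
 */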
1643 | |||
1644 | static int parse_metadata_dev(struct clone *clone, struct dm_arg_set *as, char **error) | ||
1645 | { | ||
1646 | int r; | ||
1647 | sector_t metadata_dev_size; | ||
1648 | char b[BDEVNAME_SIZE]; | ||
1649 | |||
1650 | r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, | ||
1651 | &clone->metadata_dev); | ||
1652 | if (r) { | ||
1653 | *error = "Error opening metadata device"; | ||
1654 | return r; | ||
1655 | } | ||
1656 | |||
1657 | metadata_dev_size = get_dev_size(clone->metadata_dev); | ||
1658 | if (metadata_dev_size > DM_CLONE_METADATA_MAX_SECTORS_WARNING) | ||
1659 | DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", | ||
1660 | bdevname(clone->metadata_dev->bdev, b), DM_CLONE_METADATA_MAX_SECTORS); | ||
1661 | |||
1662 | return 0; | ||
1663 | } | ||
1664 | |||
1665 | static int parse_dest_dev(struct clone *clone, struct dm_arg_set *as, char **error) | ||
1666 | { | ||
1667 | int r; | ||
1668 | sector_t dest_dev_size; | ||
1669 | |||
1670 | r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, | ||
1671 | &clone->dest_dev); | ||
1672 | if (r) { | ||
1673 | *error = "Error opening destination device"; | ||
1674 | return r; | ||
1675 | } | ||
1676 | |||
1677 | dest_dev_size = get_dev_size(clone->dest_dev); | ||
1678 | if (dest_dev_size < clone->ti->len) { | ||
1679 | dm_put_device(clone->ti, clone->dest_dev); | ||
1680 | *error = "Device size larger than destination device"; | ||
1681 | return -EINVAL; | ||
1682 | } | ||
1683 | |||
1684 | return 0; | ||
1685 | } | ||
1686 | |||
1687 | static int parse_source_dev(struct clone *clone, struct dm_arg_set *as, char **error) | ||
1688 | { | ||
1689 | int r; | ||
1690 | sector_t source_dev_size; | ||
1691 | |||
1692 | r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ, | ||
1693 | &clone->source_dev); | ||
1694 | if (r) { | ||
1695 | *error = "Error opening source device"; | ||
1696 | return r; | ||
1697 | } | ||
1698 | |||
1699 | source_dev_size = get_dev_size(clone->source_dev); | ||
1700 | if (source_dev_size < clone->ti->len) { | ||
1701 | dm_put_device(clone->ti, clone->source_dev); | ||
1702 | *error = "Device size larger than source device"; | ||
1703 | return -EINVAL; | ||
1704 | } | ||
1705 | |||
1706 | return 0; | ||
1707 | } | ||
1708 | |||
1709 | static int copy_ctr_args(struct clone *clone, int argc, const char **argv, char **error) | ||
1710 | { | ||
1711 | unsigned int i; | ||
1712 | const char **copy; | ||
1713 | |||
1714 | copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); | ||
1715 | if (!copy) | ||
1716 | goto error; | ||
1717 | |||
1718 | for (i = 0; i < argc; i++) { | ||
1719 | copy[i] = kstrdup(argv[i], GFP_KERNEL); | ||
1720 | |||
1721 | if (!copy[i]) { | ||
1722 | while (i--) | ||
1723 | kfree(copy[i]); | ||
1724 | kfree(copy); | ||
1725 | goto error; | ||
1726 | } | ||
1727 | } | ||
1728 | |||
1729 | clone->nr_ctr_args = argc; | ||
1730 | clone->ctr_args = copy; | ||
1731 | return 0; | ||
1732 | |||
1733 | error: | ||
1734 | *error = "Failed to allocate memory for table line"; | ||
1735 | return -ENOMEM; | ||
1736 | } | ||
1737 | |||
1738 | static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv) | ||
1739 | { | ||
1740 | int r; | ||
1741 | struct clone *clone; | ||
1742 | struct dm_arg_set as; | ||
1743 | |||
1744 | if (argc < 4) { | ||
1745 | ti->error = "Invalid number of arguments"; | ||
1746 | return -EINVAL; | ||
1747 | } | ||
1748 | |||
1749 | as.argc = argc; | ||
1750 | as.argv = argv; | ||
1751 | |||
1752 | clone = kzalloc(sizeof(*clone), GFP_KERNEL); | ||
1753 | if (!clone) { | ||
1754 | ti->error = "Failed to allocate clone structure"; | ||
1755 | return -ENOMEM; | ||
1756 | } | ||
1757 | |||
1758 | clone->ti = ti; | ||
1759 | |||
1760 | /* Initialize dm-clone flags */ | ||
1761 | __set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags); | ||
1762 | __set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags); | ||
1763 | __set_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); | ||
1764 | |||
1765 | r = parse_metadata_dev(clone, &as, &ti->error); | ||
1766 | if (r) | ||
1767 | goto out_with_clone; | ||
1768 | |||
1769 | r = parse_dest_dev(clone, &as, &ti->error); | ||
1770 | if (r) | ||
1771 | goto out_with_meta_dev; | ||
1772 | |||
1773 | r = parse_source_dev(clone, &as, &ti->error); | ||
1774 | if (r) | ||
1775 | goto out_with_dest_dev; | ||
1776 | |||
1777 | r = parse_region_size(clone, &as, &ti->error); | ||
1778 | if (r) | ||
1779 | goto out_with_source_dev; | ||
1780 | |||
1781 | clone->region_shift = __ffs(clone->region_size); | ||
1782 | clone->nr_regions = dm_sector_div_up(ti->len, clone->region_size); | ||
1783 | |||
1784 | r = validate_nr_regions(clone->nr_regions, &ti->error); | ||
1785 | if (r) | ||
1786 | goto out_with_source_dev; | ||
1787 | |||
1788 | r = dm_set_target_max_io_len(ti, clone->region_size); | ||
1789 | if (r) { | ||
1790 | ti->error = "Failed to set max io len"; | ||
1791 | goto out_with_source_dev; | ||
1792 | } | ||
1793 | |||
1794 | r = parse_feature_args(&as, clone); | ||
1795 | if (r) | ||
1796 | goto out_with_source_dev; | ||
1797 | |||
1798 | r = parse_core_args(&as, clone); | ||
1799 | if (r) | ||
1800 | goto out_with_source_dev; | ||
1801 | |||
1802 | /* Load metadata */ | ||
1803 | clone->cmd = dm_clone_metadata_open(clone->metadata_dev->bdev, ti->len, | ||
1804 | clone->region_size); | ||
1805 | if (IS_ERR(clone->cmd)) { | ||
1806 | ti->error = "Failed to load metadata"; | ||
1807 | r = PTR_ERR(clone->cmd); | ||
1808 | goto out_with_source_dev; | ||
1809 | } | ||
1810 | |||
1811 | __set_clone_mode(clone, CM_WRITE); | ||
1812 | |||
1813 | if (get_clone_mode(clone) != CM_WRITE) { | ||
1814 | ti->error = "Unable to get write access to metadata, please check/repair metadata"; | ||
1815 | r = -EPERM; | ||
1816 | goto out_with_metadata; | ||
1817 | } | ||
1818 | |||
1819 | clone->last_commit_jiffies = jiffies; | ||
1820 | |||
1821 | /* Allocate hydration hash table */ | ||
1822 | r = hash_table_init(clone); | ||
1823 | if (r) { | ||
1824 | ti->error = "Failed to allocate hydration hash table"; | ||
1825 | goto out_with_metadata; | ||
1826 | } | ||
1827 | |||
1828 | atomic_set(&clone->ios_in_flight, 0); | ||
1829 | init_waitqueue_head(&clone->hydration_stopped); | ||
1830 | spin_lock_init(&clone->lock); | ||
1831 | bio_list_init(&clone->deferred_bios); | ||
1832 | bio_list_init(&clone->deferred_discard_bios); | ||
1833 | bio_list_init(&clone->deferred_flush_bios); | ||
1834 | bio_list_init(&clone->deferred_flush_completions); | ||
1835 | clone->hydration_offset = 0; | ||
1836 | atomic_set(&clone->hydrations_in_flight, 0); | ||
1837 | |||
1838 | clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); | ||
1839 | if (!clone->wq) { | ||
1840 | ti->error = "Failed to allocate workqueue"; | ||
1841 | r = -ENOMEM; | ||
1842 | goto out_with_ht; | ||
1843 | } | ||
1844 | |||
1845 | INIT_WORK(&clone->worker, do_worker); | ||
1846 | INIT_DELAYED_WORK(&clone->waker, do_waker); | ||
1847 | |||
1848 | clone->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle); | ||
1849 | if (IS_ERR(clone->kcopyd_client)) { | ||
1850 | r = PTR_ERR(clone->kcopyd_client); | ||
1851 | goto out_with_wq; | ||
1852 | } | ||
1853 | |||
1854 | r = mempool_init_slab_pool(&clone->hydration_pool, MIN_HYDRATIONS, | ||
1855 | _hydration_cache); | ||
1856 | if (r) { | ||
1857 | ti->error = "Failed to create dm_clone_region_hydration memory pool"; | ||
1858 | goto out_with_kcopyd; | ||
1859 | } | ||
1860 | |||
1861 | /* Save a copy of the table line */ | ||
1862 | r = copy_ctr_args(clone, argc - 3, (const char **)argv + 3, &ti->error); | ||
1863 | if (r) | ||
1864 | goto out_with_mempool; | ||
1865 | |||
1866 | mutex_init(&clone->commit_lock); | ||
1867 | clone->callbacks.congested_fn = clone_is_congested; | ||
1868 | dm_table_add_target_callbacks(ti->table, &clone->callbacks); | ||
1869 | |||
1870 | /* Enable flushes */ | ||
1871 | ti->num_flush_bios = 1; | ||
1872 | ti->flush_supported = true; | ||
1873 | |||
1874 | /* Enable discards */ | ||
1875 | ti->discards_supported = true; | ||
1876 | ti->num_discard_bios = 1; | ||
1877 | |||
1878 | ti->private = clone; | ||
1879 | |||
1880 | return 0; | ||
1881 | |||
1882 | out_with_mempool: | ||
1883 | mempool_exit(&clone->hydration_pool); | ||
1884 | out_with_kcopyd: | ||
1885 | dm_kcopyd_client_destroy(clone->kcopyd_client); | ||
1886 | out_with_wq: | ||
1887 | destroy_workqueue(clone->wq); | ||
1888 | out_with_ht: | ||
1889 | hash_table_exit(clone); | ||
1890 | out_with_metadata: | ||
1891 | dm_clone_metadata_close(clone->cmd); | ||
1892 | out_with_source_dev: | ||
1893 | dm_put_device(ti, clone->source_dev); | ||
1894 | out_with_dest_dev: | ||
1895 | dm_put_device(ti, clone->dest_dev); | ||
1896 | out_with_meta_dev: | ||
1897 | dm_put_device(ti, clone->metadata_dev); | ||
1898 | out_with_clone: | ||
1899 | kfree(clone); | ||
1900 | |||
1901 | return r; | ||
1902 | } | ||
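/*
 * Putting the constructor together, an illustrative table line (device
 * paths and sizes are placeholders) for a 1 GiB clone with 64 KiB regions
 * and background hydration disabled could be loaded with:
 *
 *   dmsetup create cloned-dev --table \
 *     "0 2097152 clone /dev/vg/metadata /dev/vg/dest /dev/vg/source 128 1 no_hydration"
 *
 * i.e. <metadata dev> <destination dev> <source dev> <region size>, followed
 * by the optional feature and core argument groups parsed above.
 */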
1903 | |||
1904 | static void clone_dtr(struct dm_target *ti) | ||
1905 | { | ||
1906 | unsigned int i; | ||
1907 | struct clone *clone = ti->private; | ||
1908 | |||
1909 | mutex_destroy(&clone->commit_lock); | ||
1910 | |||
1911 | for (i = 0; i < clone->nr_ctr_args; i++) | ||
1912 | kfree(clone->ctr_args[i]); | ||
1913 | kfree(clone->ctr_args); | ||
1914 | |||
1915 | mempool_exit(&clone->hydration_pool); | ||
1916 | dm_kcopyd_client_destroy(clone->kcopyd_client); | ||
1917 | destroy_workqueue(clone->wq); | ||
1918 | hash_table_exit(clone); | ||
1919 | dm_clone_metadata_close(clone->cmd); | ||
1920 | dm_put_device(ti, clone->source_dev); | ||
1921 | dm_put_device(ti, clone->dest_dev); | ||
1922 | dm_put_device(ti, clone->metadata_dev); | ||
1923 | |||
1924 | kfree(clone); | ||
1925 | } | ||
1926 | |||
1927 | /*---------------------------------------------------------------------------*/ | ||
1928 | |||
1929 | static void clone_postsuspend(struct dm_target *ti) | ||
1930 | { | ||
1931 | struct clone *clone = ti->private; | ||
1932 | |||
1933 | /* | ||
1934 | * To successfully suspend the device: | ||
1935 | * | ||
1936 | * - We cancel the delayed work for periodic commits and wait for | ||
1937 | * it to finish. | ||
1938 | * | ||
1939 | * - We stop the background hydration, i.e. we prevent new region | ||
1940 | * hydrations from starting. | ||
1941 | * | ||
1942 | * - We wait for any in-flight hydrations to finish. | ||
1943 | * | ||
1944 | * - We flush the workqueue. | ||
1945 | * | ||
1946 | * - We commit the metadata. | ||
1947 | */ | ||
1948 | cancel_delayed_work_sync(&clone->waker); | ||
1949 | |||
1950 | set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags); | ||
1951 | |||
1952 | /* | ||
1953 | * Make sure set_bit() is ordered before atomic_read(), otherwise we | ||
1954 | * might race with do_hydration() and miss some started region | ||
1955 | * hydrations. | ||
1956 | * | ||
1957 | * This is paired with smp_mb__after_atomic() in do_hydration(). | ||
1958 | */ | ||
1959 | smp_mb__after_atomic(); | ||
1960 | |||
1961 | wait_event(clone->hydration_stopped, !atomic_read(&clone->hydrations_in_flight)); | ||
1962 | flush_workqueue(clone->wq); | ||
1963 | |||
1964 | (void) commit_metadata(clone); | ||
1965 | } | ||
1966 | |||
1967 | static void clone_resume(struct dm_target *ti) | ||
1968 | { | ||
1969 | struct clone *clone = ti->private; | ||
1970 | |||
1971 | clear_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags); | ||
1972 | do_waker(&clone->waker.work); | ||
1973 | } | ||
1974 | |||
1975 | static bool bdev_supports_discards(struct block_device *bdev) | ||
1976 | { | ||
1977 | struct request_queue *q = bdev_get_queue(bdev); | ||
1978 | |||
1979 | return (q && blk_queue_discard(q)); | ||
1980 | } | ||
1981 | |||
1982 | /* | ||
1983 | * If discard_passdown was enabled, verify that the destination device supports | ||
1984 | * discards. Disable discard_passdown if not. | ||
1985 | */ | ||
1986 | static void disable_passdown_if_not_supported(struct clone *clone) | ||
1987 | { | ||
1988 | struct block_device *dest_dev = clone->dest_dev->bdev; | ||
1989 | struct queue_limits *dest_limits = &bdev_get_queue(dest_dev)->limits; | ||
1990 | const char *reason = NULL; | ||
1991 | char buf[BDEVNAME_SIZE]; | ||
1992 | |||
1993 | if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) | ||
1994 | return; | ||
1995 | |||
1996 | if (!bdev_supports_discards(dest_dev)) | ||
1997 | reason = "discard unsupported"; | ||
1998 | else if (dest_limits->max_discard_sectors < clone->region_size) | ||
1999 | reason = "max discard sectors smaller than a region"; | ||
2000 | |||
2001 | if (reason) { | ||
2002 | DMWARN("Destination device (%s) %s: Disabling discard passdown.", | ||
2003 | bdevname(dest_dev, buf), reason); | ||
2004 | clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags); | ||
2005 | } | ||
2006 | } | ||
2007 | |||
2008 | static void set_discard_limits(struct clone *clone, struct queue_limits *limits) | ||
2009 | { | ||
2010 | struct block_device *dest_bdev = clone->dest_dev->bdev; | ||
2011 | struct queue_limits *dest_limits = &bdev_get_queue(dest_bdev)->limits; | ||
2012 | |||
2013 | if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) { | ||
2014 | /* No passdown is done so we set our own virtual limits */ | ||
2015 | limits->discard_granularity = clone->region_size << SECTOR_SHIFT; | ||
2016 | limits->max_discard_sectors = round_down(UINT_MAX >> SECTOR_SHIFT, clone->region_size); | ||
2017 | return; | ||
2018 | } | ||
2019 | |||
2020 | /* | ||
2021 | * clone_iterate_devices() is stacking both the source and destination | ||
2022 | * device limits but discards aren't passed to the source device, so | ||
2023 | * inherit destination's limits. | ||
2024 | */ | ||
2025 | limits->max_discard_sectors = dest_limits->max_discard_sectors; | ||
2026 | limits->max_hw_discard_sectors = dest_limits->max_hw_discard_sectors; | ||
2027 | limits->discard_granularity = dest_limits->discard_granularity; | ||
2028 | limits->discard_alignment = dest_limits->discard_alignment; | ||
2029 | limits->discard_misaligned = dest_limits->discard_misaligned; | ||
2030 | limits->max_discard_segments = dest_limits->max_discard_segments; | ||
2031 | } | ||
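/*
 * For instance, with passdown disabled and 64 KiB regions (128 sectors) the
 * virtual limits above advertise a 64 KiB discard granularity and a maximum
 * discard length of round_down(UINT_MAX >> SECTOR_SHIFT, 128) sectors, so
 * the limits are expressed in whole regions.
 */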
2032 | |||
2033 | static void clone_io_hints(struct dm_target *ti, struct queue_limits *limits) | ||
2034 | { | ||
2035 | struct clone *clone = ti->private; | ||
2036 | u64 io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; | ||
2037 | |||
2038 | /* | ||
2039 | * If the system-determined stacked limits are compatible with | ||
2040 | * dm-clone's region size (io_opt is a factor) do not override them. | ||
2041 | */ | ||
2042 | if (io_opt_sectors < clone->region_size || | ||
2043 | do_div(io_opt_sectors, clone->region_size)) { | ||
2044 | blk_limits_io_min(limits, clone->region_size << SECTOR_SHIFT); | ||
2045 | blk_limits_io_opt(limits, clone->region_size << SECTOR_SHIFT); | ||
2046 | } | ||
2047 | |||
2048 | disable_passdown_if_not_supported(clone); | ||
2049 | set_discard_limits(clone, limits); | ||
2050 | } | ||
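/*
 * For example, with 64 KiB regions (128 sectors) a stacked io_opt of 256 KiB
 * (512 sectors) is a multiple of the region size and is left untouched,
 * whereas a stacked io_opt of 48 KiB (96 sectors) is smaller than a region,
 * so both io_min and io_opt are overridden to 64 KiB.
 */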
2051 | |||
2052 | static int clone_iterate_devices(struct dm_target *ti, | ||
2053 | iterate_devices_callout_fn fn, void *data) | ||
2054 | { | ||
2055 | int ret; | ||
2056 | struct clone *clone = ti->private; | ||
2057 | struct dm_dev *dest_dev = clone->dest_dev; | ||
2058 | struct dm_dev *source_dev = clone->source_dev; | ||
2059 | |||
2060 | ret = fn(ti, source_dev, 0, ti->len, data); | ||
2061 | if (!ret) | ||
2062 | ret = fn(ti, dest_dev, 0, ti->len, data); | ||
2063 | return ret; | ||
2064 | } | ||
2065 | |||
2066 | /* | ||
2067 | * dm-clone message functions. | ||
2068 | */ | ||
2069 | static void set_hydration_threshold(struct clone *clone, unsigned int nr_regions) | ||
2070 | { | ||
2071 | WRITE_ONCE(clone->hydration_threshold, nr_regions); | ||
2072 | |||
2073 | /* | ||
2074 | * If user space sets hydration_threshold to zero then background hydration | ||
2075 | * will stop. If the hydration_threshold is later increased, we must restart | ||
2076 | * the hydration process by waking up the worker. | ||
2077 | */ | ||
2078 | wake_worker(clone); | ||
2079 | } | ||
2080 | |||
2081 | static void set_hydration_batch_size(struct clone *clone, unsigned int nr_regions) | ||
2082 | { | ||
2083 | WRITE_ONCE(clone->hydration_batch_size, nr_regions); | ||
2084 | } | ||
2085 | |||
2086 | static void enable_hydration(struct clone *clone) | ||
2087 | { | ||
2088 | if (!test_and_set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags)) | ||
2089 | wake_worker(clone); | ||
2090 | } | ||
2091 | |||
2092 | static void disable_hydration(struct clone *clone) | ||
2093 | { | ||
2094 | clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags); | ||
2095 | } | ||
2096 | |||
2097 | static int clone_message(struct dm_target *ti, unsigned int argc, char **argv, | ||
2098 | char *result, unsigned int maxlen) | ||
2099 | { | ||
2100 | struct clone *clone = ti->private; | ||
2101 | unsigned int value; | ||
2102 | |||
2103 | if (!argc) | ||
2104 | return -EINVAL; | ||
2105 | |||
2106 | if (!strcasecmp(argv[0], "enable_hydration")) { | ||
2107 | enable_hydration(clone); | ||
2108 | return 0; | ||
2109 | } | ||
2110 | |||
2111 | if (!strcasecmp(argv[0], "disable_hydration")) { | ||
2112 | disable_hydration(clone); | ||
2113 | return 0; | ||
2114 | } | ||
2115 | |||
2116 | if (argc != 2) | ||
2117 | return -EINVAL; | ||
2118 | |||
2119 | if (!strcasecmp(argv[0], "hydration_threshold")) { | ||
2120 | if (kstrtouint(argv[1], 10, &value)) | ||
2121 | return -EINVAL; | ||
2122 | |||
2123 | set_hydration_threshold(clone, value); | ||
2124 | |||
2125 | return 0; | ||
2126 | } | ||
2127 | |||
2128 | if (!strcasecmp(argv[0], "hydration_batch_size")) { | ||
2129 | if (kstrtouint(argv[1], 10, &value)) | ||
2130 | return -EINVAL; | ||
2131 | |||
2132 | set_hydration_batch_size(clone, value); | ||
2133 | |||
2134 | return 0; | ||
2135 | } | ||
2136 | |||
2137 | DMERR("%s: Unsupported message `%s'", clone_device_name(clone), argv[0]); | ||
2138 | return -EINVAL; | ||
2139 | } | ||
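/*
 * The messages handled above are sent with dmsetup's message command, e.g.
 * (the device name is a placeholder):
 *
 *   dmsetup message cloned-dev 0 disable_hydration
 *   dmsetup message cloned-dev 0 hydration_threshold 256
 *   dmsetup message cloned-dev 0 hydration_batch_size 64
 *   dmsetup message cloned-dev 0 enable_hydration
 */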
2140 | |||
2141 | static struct target_type clone_target = { | ||
2142 | .name = "clone", | ||
2143 | .version = {1, 0, 0}, | ||
2144 | .module = THIS_MODULE, | ||
2145 | .ctr = clone_ctr, | ||
2146 | .dtr = clone_dtr, | ||
2147 | .map = clone_map, | ||
2148 | .end_io = clone_endio, | ||
2149 | .postsuspend = clone_postsuspend, | ||
2150 | .resume = clone_resume, | ||
2151 | .status = clone_status, | ||
2152 | .message = clone_message, | ||
2153 | .io_hints = clone_io_hints, | ||
2154 | .iterate_devices = clone_iterate_devices, | ||
2155 | }; | ||
2156 | |||
2157 | /*---------------------------------------------------------------------------*/ | ||
2158 | |||
2159 | /* Module functions */ | ||
2160 | static int __init dm_clone_init(void) | ||
2161 | { | ||
2162 | int r; | ||
2163 | |||
2164 | _hydration_cache = KMEM_CACHE(dm_clone_region_hydration, 0); | ||
2165 | if (!_hydration_cache) | ||
2166 | return -ENOMEM; | ||
2167 | |||
2168 | r = dm_register_target(&clone_target); | ||
2169 | if (r < 0) { | ||
2170 | DMERR("Failed to register clone target"); | ||
2171 | return r; | ||
2172 | } | ||
2173 | |||
2174 | return 0; | ||
2175 | } | ||
2176 | |||
2177 | static void __exit dm_clone_exit(void) | ||
2178 | { | ||
2179 | dm_unregister_target(&clone_target); | ||
2180 | |||
2181 | kmem_cache_destroy(_hydration_cache); | ||
2182 | _hydration_cache = NULL; | ||
2183 | } | ||
2184 | |||
2185 | /* Module hooks */ | ||
2186 | module_init(dm_clone_init); | ||
2187 | module_exit(dm_clone_exit); | ||
2188 | |||
2189 | MODULE_DESCRIPTION(DM_NAME " clone target"); | ||
2190 | MODULE_AUTHOR("Nikos Tsironis <ntsironis@arrikto.com>"); | ||
2191 | MODULE_LICENSE("GPL"); | ||