author     Linus Torvalds <torvalds@linux-foundation.org>  2018-06-12 21:12:08 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2018-06-12 21:12:08 -0400
commit     4597fcff07044d89c646d0c5d8b42cd976d966a1 (patch)
tree       ace9a18c624e6ede7229d495aa5bad393daded92
parent     a205f0c974db78c6a1a8ce31cd4c0b45ac45ea40 (diff)
parent     48debafe4f2feabcc99f8e2659e80557e3ca6b39 (diff)
Merge tag 'for-4.18/dm-changes-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:

 - Adjust various DM structure members to improve alignment relative to
   4.18 block's mempool_t and bioset changes.

 - Add DM writecache target that offers writeback caching to persistent
   memory or SSD.

 - Small DM core error message change to give context for why a DM
   table type transition wasn't allowed.

* tag 'for-4.18/dm-changes-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
  dm: add writecache target
  dm: adjust structure members to improve alignment
  dm: report which conflicting type caused error during table_load()
-rw-r--r--  Documentation/device-mapper/writecache.txt    68
-rw-r--r--  drivers/md/Kconfig                             11
-rw-r--r--  drivers/md/Makefile                             1
-rw-r--r--  drivers/md/dm-bio-prison-v1.c                   2
-rw-r--r--  drivers/md/dm-bio-prison-v2.c                   2
-rw-r--r--  drivers/md/dm-cache-target.c                   61
-rw-r--r--  drivers/md/dm-core.h                           38
-rw-r--r--  drivers/md/dm-crypt.c                          26
-rw-r--r--  drivers/md/dm-ioctl.c                           3
-rw-r--r--  drivers/md/dm-kcopyd.c                          3
-rw-r--r--  drivers/md/dm-region-hash.c                    13
-rw-r--r--  drivers/md/dm-thin.c                            5
-rw-r--r--  drivers/md/dm-writecache.c                   2305
-rw-r--r--  drivers/md/dm-zoned-target.c                    2
14 files changed, 2466 insertions, 74 deletions
diff --git a/Documentation/device-mapper/writecache.txt b/Documentation/device-mapper/writecache.txt
new file mode 100644
index 000000000000..4424fa2c67d7
--- /dev/null
+++ b/Documentation/device-mapper/writecache.txt
@@ -0,0 +1,68 @@
The writecache target caches writes on persistent memory or on SSD. It
doesn't cache reads because reads are supposed to be cached in the page
cache in normal RAM.

When the device is constructed, the first sector should be zeroed or the
first sector should contain a valid superblock from a previous invocation.

Constructor parameters:
1. type of the cache device - "p" or "s"
	p - persistent memory
	s - SSD
2. the underlying device that will be cached
3. the cache device
4. block size (4096 is recommended; the maximum block size is the page
   size)
5. the number of optional parameters (the parameters with an argument
   count as two)
	high_watermark n	(default: 50)
		start writeback when the number of used blocks reaches this
		watermark
	low_watermark x		(default: 45)
		stop writeback when the number of used blocks drops below
		this watermark
	writeback_jobs n	(default: unlimited)
		limit the number of blocks that are in flight during
		writeback. Setting this value reduces writeback
		throughput, but it may improve latency of read requests
	autocommit_blocks n	(default: 64 for pmem, 65536 for ssd)
		when the application writes this number of blocks without
		issuing a FLUSH request, the blocks are automatically
		committed
	autocommit_time ms	(default: 1000)
		autocommit time in milliseconds. The data is automatically
		committed if this time passes and no FLUSH request is
		received
	fua			(by default on)
		applicable only to persistent memory - use the FUA flag
		when writing data from persistent memory back to the
		underlying device
	nofua
		applicable only to persistent memory - don't use the FUA
		flag when writing back data and send the FLUSH request
		afterwards
		- some underlying devices perform better with fua, some
		  with nofua. The user should test it

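As a sketch of how these parameters fit together, assume a hypothetical
origin device /dev/vdb that is 2097152 sectors long and an SSD cache
device /dev/vdc; a dmsetup invocation could then look roughly like:

	dmsetup create wc --table "0 2097152 writecache s /dev/vdb /dev/vdc 4096 4 high_watermark 60 writeback_jobs 1024"

Here "s" selects SSD mode, 4096 is the block size and "4" is the number
of optional parameter words that follow (high_watermark 60 and
writeback_jobs 1024 each count as two).
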
Status:
1. error indicator - 0 if there was no error, otherwise error number
2. the number of blocks
3. the number of free blocks
4. the number of blocks under writeback

Messages:
	flush
		flush the cache device. The message returns successfully
		if the cache device was flushed without an error
	flush_on_suspend
		flush the cache device on next suspend. Use this message
		when you are going to remove the cache device. The proper
		sequence for removing the cache device is:
		1. send the "flush_on_suspend" message
		2. load an inactive table with a linear target that maps
		   to the underlying device
		3. suspend the device
		4. ask for status and verify that there are no errors
		5. resume the device, so that it will use the linear
		   target
		6. the cache device is now inactive and it can be deleted
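
Assuming a hypothetical writecache device named "wc" that is 2097152
sectors long and caches /dev/vdb, that removal sequence could be driven
with dmsetup roughly as follows:

	dmsetup message wc 0 flush_on_suspend
	dmsetup reload wc --table "0 2097152 linear /dev/vdb 0"
	dmsetup suspend wc
	dmsetup status wc
	dmsetup resume wc

The status output should report an error indicator of 0 before resuming;
once the linear table is live, the cache device is no longer in use and
can be detached.
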
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index edff083f7c4e..8b8c123cae66 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -334,6 +334,17 @@ config DM_CACHE_SMQ
334 of less memory utilization, improved performance and increased 334 of less memory utilization, improved performance and increased
335 adaptability in the face of changing workloads. 335 adaptability in the face of changing workloads.
336 336
337config DM_WRITECACHE
338 tristate "Writecache target"
339 depends on BLK_DEV_DM
340 ---help---
341 The writecache target caches writes on persistent memory or SSD.
342 It is intended for databases or other programs that need extremely
343 low commit latency.
344
345 The writecache target doesn't cache reads because reads are supposed
346 to be cached in standard RAM.
347
337config DM_ERA 348config DM_ERA
338 tristate "Era target (EXPERIMENTAL)" 349 tristate "Era target (EXPERIMENTAL)"
339 depends on BLK_DEV_DM 350 depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 63255f3ebd97..822f4e8753bc 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -67,6 +67,7 @@ obj-$(CONFIG_DM_ERA) += dm-era.o
67obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o 67obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
68obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o 68obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o
69obj-$(CONFIG_DM_ZONED) += dm-zoned.o 69obj-$(CONFIG_DM_ZONED) += dm-zoned.o
70obj-$(CONFIG_DM_WRITECACHE) += dm-writecache.o
70 71
71ifeq ($(CONFIG_DM_UEVENT),y) 72ifeq ($(CONFIG_DM_UEVENT),y)
72dm-mod-objs += dm-uevent.o 73dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c
index e794e3662fdd..b5389890bbc3 100644
--- a/drivers/md/dm-bio-prison-v1.c
+++ b/drivers/md/dm-bio-prison-v1.c
@@ -19,8 +19,8 @@
19 19
20struct dm_bio_prison { 20struct dm_bio_prison {
21 spinlock_t lock; 21 spinlock_t lock;
22 mempool_t cell_pool;
23 struct rb_root cells; 22 struct rb_root cells;
23 mempool_t cell_pool;
24}; 24};
25 25
26static struct kmem_cache *_cell_cache; 26static struct kmem_cache *_cell_cache;
diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c
index f866bc97b032..b092cdc8e1ae 100644
--- a/drivers/md/dm-bio-prison-v2.c
+++ b/drivers/md/dm-bio-prison-v2.c
@@ -21,8 +21,8 @@ struct dm_bio_prison_v2 {
21 struct workqueue_struct *wq; 21 struct workqueue_struct *wq;
22 22
23 spinlock_t lock; 23 spinlock_t lock;
24 mempool_t cell_pool;
25 struct rb_root cells; 24 struct rb_root cells;
25 mempool_t cell_pool;
26}; 26};
27 27
28static struct kmem_cache *_cell_cache; 28static struct kmem_cache *_cell_cache;
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 001c71248246..ce14a3d1f609 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -371,7 +371,13 @@ struct cache_stats {
371 371
372struct cache { 372struct cache {
373 struct dm_target *ti; 373 struct dm_target *ti;
374 struct dm_target_callbacks callbacks; 374 spinlock_t lock;
375
376 /*
377 * Fields for converting from sectors to blocks.
378 */
379 int sectors_per_block_shift;
380 sector_t sectors_per_block;
375 381
376 struct dm_cache_metadata *cmd; 382 struct dm_cache_metadata *cmd;
377 383
@@ -402,13 +408,11 @@ struct cache {
402 dm_cblock_t cache_size; 408 dm_cblock_t cache_size;
403 409
404 /* 410 /*
405 * Fields for converting from sectors to blocks. 411 * Invalidation fields.
406 */ 412 */
407 sector_t sectors_per_block; 413 spinlock_t invalidation_lock;
408 int sectors_per_block_shift; 414 struct list_head invalidation_requests;
409 415
410 spinlock_t lock;
411 struct bio_list deferred_bios;
412 sector_t migration_threshold; 416 sector_t migration_threshold;
413 wait_queue_head_t migration_wait; 417 wait_queue_head_t migration_wait;
414 atomic_t nr_allocated_migrations; 418 atomic_t nr_allocated_migrations;
@@ -419,13 +423,11 @@ struct cache {
419 */ 423 */
420 atomic_t nr_io_migrations; 424 atomic_t nr_io_migrations;
421 425
426 struct bio_list deferred_bios;
427
422 struct rw_semaphore quiesce_lock; 428 struct rw_semaphore quiesce_lock;
423 429
424 /* 430 struct dm_target_callbacks callbacks;
425 * cache_size entries, dirty if set
426 */
427 atomic_t nr_dirty;
428 unsigned long *dirty_bitset;
429 431
430 /* 432 /*
431 * origin_blocks entries, discarded if set. 433 * origin_blocks entries, discarded if set.
@@ -442,17 +444,27 @@ struct cache {
442 const char **ctr_args; 444 const char **ctr_args;
443 445
444 struct dm_kcopyd_client *copier; 446 struct dm_kcopyd_client *copier;
445 struct workqueue_struct *wq;
446 struct work_struct deferred_bio_worker; 447 struct work_struct deferred_bio_worker;
447 struct work_struct migration_worker; 448 struct work_struct migration_worker;
449 struct workqueue_struct *wq;
448 struct delayed_work waker; 450 struct delayed_work waker;
449 struct dm_bio_prison_v2 *prison; 451 struct dm_bio_prison_v2 *prison;
450 struct bio_set bs;
451 452
452 mempool_t migration_pool; 453 /*
454 * cache_size entries, dirty if set
455 */
456 unsigned long *dirty_bitset;
457 atomic_t nr_dirty;
453 458
454 struct dm_cache_policy *policy;
455 unsigned policy_nr_args; 459 unsigned policy_nr_args;
460 struct dm_cache_policy *policy;
461
462 /*
463 * Cache features such as write-through.
464 */
465 struct cache_features features;
466
467 struct cache_stats stats;
456 468
457 bool need_tick_bio:1; 469 bool need_tick_bio:1;
458 bool sized:1; 470 bool sized:1;
@@ -461,25 +473,16 @@ struct cache {
461 bool loaded_mappings:1; 473 bool loaded_mappings:1;
462 bool loaded_discards:1; 474 bool loaded_discards:1;
463 475
464 /* 476 struct rw_semaphore background_work_lock;
465 * Cache features such as write-through.
466 */
467 struct cache_features features;
468
469 struct cache_stats stats;
470 477
471 /* 478 struct batcher committer;
472 * Invalidation fields. 479 struct work_struct commit_ws;
473 */
474 spinlock_t invalidation_lock;
475 struct list_head invalidation_requests;
476 480
477 struct io_tracker tracker; 481 struct io_tracker tracker;
478 482
479 struct work_struct commit_ws; 483 mempool_t migration_pool;
480 struct batcher committer;
481 484
482 struct rw_semaphore background_work_lock; 485 struct bio_set bs;
483}; 486};
484 487
485struct per_bio_data { 488struct per_bio_data {
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index f21c5d21bf1b..7d480c930eaf 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -31,6 +31,9 @@ struct dm_kobject_holder {
31struct mapped_device { 31struct mapped_device {
32 struct mutex suspend_lock; 32 struct mutex suspend_lock;
33 33
34 struct mutex table_devices_lock;
35 struct list_head table_devices;
36
34 /* 37 /*
35 * The current mapping (struct dm_table *). 38 * The current mapping (struct dm_table *).
36 * Use dm_get_live_table{_fast} or take suspend_lock for 39 * Use dm_get_live_table{_fast} or take suspend_lock for
@@ -38,17 +41,14 @@ struct mapped_device {
38 */ 41 */
39 void __rcu *map; 42 void __rcu *map;
40 43
41 struct list_head table_devices;
42 struct mutex table_devices_lock;
43
44 unsigned long flags; 44 unsigned long flags;
45 45
46 struct request_queue *queue;
47 int numa_node_id;
48
49 enum dm_queue_mode type;
50 /* Protect queue and type against concurrent access. */ 46 /* Protect queue and type against concurrent access. */
51 struct mutex type_lock; 47 struct mutex type_lock;
48 enum dm_queue_mode type;
49
50 int numa_node_id;
51 struct request_queue *queue;
52 52
53 atomic_t holders; 53 atomic_t holders;
54 atomic_t open_count; 54 atomic_t open_count;
@@ -56,21 +56,21 @@ struct mapped_device {
56 struct dm_target *immutable_target; 56 struct dm_target *immutable_target;
57 struct target_type *immutable_target_type; 57 struct target_type *immutable_target_type;
58 58
59 char name[16];
59 struct gendisk *disk; 60 struct gendisk *disk;
60 struct dax_device *dax_dev; 61 struct dax_device *dax_dev;
61 char name[16];
62
63 void *interface_ptr;
64 62
65 /* 63 /*
66 * A list of ios that arrived while we were suspended. 64 * A list of ios that arrived while we were suspended.
67 */ 65 */
68 atomic_t pending[2];
69 wait_queue_head_t wait;
70 struct work_struct work; 66 struct work_struct work;
67 wait_queue_head_t wait;
68 atomic_t pending[2];
71 spinlock_t deferred_lock; 69 spinlock_t deferred_lock;
72 struct bio_list deferred; 70 struct bio_list deferred;
73 71
72 void *interface_ptr;
73
74 /* 74 /*
75 * Event handling. 75 * Event handling.
76 */ 76 */
@@ -84,17 +84,17 @@ struct mapped_device {
84 unsigned internal_suspend_count; 84 unsigned internal_suspend_count;
85 85
86 /* 86 /*
87 * Processing queue (flush)
88 */
89 struct workqueue_struct *wq;
90
91 /*
92 * io objects are allocated from here. 87 * io objects are allocated from here.
93 */ 88 */
94 struct bio_set io_bs; 89 struct bio_set io_bs;
95 struct bio_set bs; 90 struct bio_set bs;
96 91
97 /* 92 /*
93 * Processing queue (flush)
94 */
95 struct workqueue_struct *wq;
96
97 /*
98 * freeze/thaw support require holding onto a super block 98 * freeze/thaw support require holding onto a super block
99 */ 99 */
100 struct super_block *frozen_sb; 100 struct super_block *frozen_sb;
@@ -102,11 +102,11 @@ struct mapped_device {
102 /* forced geometry settings */ 102 /* forced geometry settings */
103 struct hd_geometry geometry; 103 struct hd_geometry geometry;
104 104
105 struct block_device *bdev;
106
107 /* kobject and completion */ 105 /* kobject and completion */
108 struct dm_kobject_holder kobj_holder; 106 struct dm_kobject_holder kobj_holder;
109 107
108 struct block_device *bdev;
109
110 /* zero-length flush that will be cloned and submitted to targets */ 110 /* zero-length flush that will be cloned and submitted to targets */
111 struct bio flush_bio; 111 struct bio flush_bio;
112 112
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index da02f4d8e4b9..4939fbc34ff2 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -139,25 +139,13 @@ struct crypt_config {
139 struct dm_dev *dev; 139 struct dm_dev *dev;
140 sector_t start; 140 sector_t start;
141 141
142 /*
143 * pool for per bio private data, crypto requests,
144 * encryption requeusts/buffer pages and integrity tags
145 */
146 mempool_t req_pool;
147 mempool_t page_pool;
148 mempool_t tag_pool;
149 unsigned tag_pool_max_sectors;
150
151 struct percpu_counter n_allocated_pages; 142 struct percpu_counter n_allocated_pages;
152 143
153 struct bio_set bs;
154 struct mutex bio_alloc_lock;
155
156 struct workqueue_struct *io_queue; 144 struct workqueue_struct *io_queue;
157 struct workqueue_struct *crypt_queue; 145 struct workqueue_struct *crypt_queue;
158 146
159 struct task_struct *write_thread;
160 wait_queue_head_t write_thread_wait; 147 wait_queue_head_t write_thread_wait;
148 struct task_struct *write_thread;
161 struct rb_root write_tree; 149 struct rb_root write_tree;
162 150
163 char *cipher; 151 char *cipher;
@@ -213,6 +201,18 @@ struct crypt_config {
213 unsigned int integrity_iv_size; 201 unsigned int integrity_iv_size;
214 unsigned int on_disk_tag_size; 202 unsigned int on_disk_tag_size;
215 203
204 /*
205 * pool for per bio private data, crypto requests,
206 * encryption requeusts/buffer pages and integrity tags
207 */
208 unsigned tag_pool_max_sectors;
209 mempool_t tag_pool;
210 mempool_t req_pool;
211 mempool_t page_pool;
212
213 struct bio_set bs;
214 struct mutex bio_alloc_lock;
215
216 u8 *authenc_key; /* space for keys in authenc() format (if used) */ 216 u8 *authenc_key; /* space for keys in authenc() format (if used) */
217 u8 key[0]; 217 u8 key[0];
218}; 218};
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 5acf77de5945..b810ea77e6b1 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1344,7 +1344,8 @@ static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_si
1344 goto err_unlock_md_type; 1344 goto err_unlock_md_type;
1345 } 1345 }
1346 } else if (!is_valid_type(dm_get_md_type(md), dm_table_get_type(t))) { 1346 } else if (!is_valid_type(dm_get_md_type(md), dm_table_get_type(t))) {
1347 DMWARN("can't change device type after initial table load."); 1347 DMWARN("can't change device type (old=%u vs new=%u) after initial table load.",
1348 dm_get_md_type(md), dm_table_get_type(t));
1348 r = -EINVAL; 1349 r = -EINVAL;
1349 goto err_unlock_md_type; 1350 goto err_unlock_md_type;
1350 } 1351 }
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index ce7efc7434be..3c7547a3c371 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -45,7 +45,6 @@ struct dm_kcopyd_client {
45 struct dm_io_client *io_client; 45 struct dm_io_client *io_client;
46 46
47 wait_queue_head_t destroyq; 47 wait_queue_head_t destroyq;
48 atomic_t nr_jobs;
49 48
50 mempool_t job_pool; 49 mempool_t job_pool;
51 50
@@ -54,6 +53,8 @@ struct dm_kcopyd_client {
54 53
55 struct dm_kcopyd_throttle *throttle; 54 struct dm_kcopyd_throttle *throttle;
56 55
56 atomic_t nr_jobs;
57
57/* 58/*
58 * We maintain three lists of jobs: 59 * We maintain three lists of jobs:
59 * 60 *
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index abf3521b80a8..c832ec398f02 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -63,27 +63,28 @@ struct dm_region_hash {
63 63
64 /* hash table */ 64 /* hash table */
65 rwlock_t hash_lock; 65 rwlock_t hash_lock;
66 mempool_t region_pool;
67 unsigned mask; 66 unsigned mask;
68 unsigned nr_buckets; 67 unsigned nr_buckets;
69 unsigned prime; 68 unsigned prime;
70 unsigned shift; 69 unsigned shift;
71 struct list_head *buckets; 70 struct list_head *buckets;
72 71
72 /*
73 * If there was a flush failure no regions can be marked clean.
74 */
75 int flush_failure;
76
73 unsigned max_recovery; /* Max # of regions to recover in parallel */ 77 unsigned max_recovery; /* Max # of regions to recover in parallel */
74 78
75 spinlock_t region_lock; 79 spinlock_t region_lock;
76 atomic_t recovery_in_flight; 80 atomic_t recovery_in_flight;
77 struct semaphore recovery_count;
78 struct list_head clean_regions; 81 struct list_head clean_regions;
79 struct list_head quiesced_regions; 82 struct list_head quiesced_regions;
80 struct list_head recovered_regions; 83 struct list_head recovered_regions;
81 struct list_head failed_recovered_regions; 84 struct list_head failed_recovered_regions;
85 struct semaphore recovery_count;
82 86
83 /* 87 mempool_t region_pool;
84 * If there was a flush failure no regions can be marked clean.
85 */
86 int flush_failure;
87 88
88 void *context; 89 void *context;
89 sector_t target_begin; 90 sector_t target_begin;
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 5772756c63c1..6cf9c9364103 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -240,9 +240,9 @@ struct pool {
240 struct dm_bio_prison *prison; 240 struct dm_bio_prison *prison;
241 struct dm_kcopyd_client *copier; 241 struct dm_kcopyd_client *copier;
242 242
243 struct work_struct worker;
243 struct workqueue_struct *wq; 244 struct workqueue_struct *wq;
244 struct throttle throttle; 245 struct throttle throttle;
245 struct work_struct worker;
246 struct delayed_work waker; 246 struct delayed_work waker;
247 struct delayed_work no_space_timeout; 247 struct delayed_work no_space_timeout;
248 248
@@ -260,7 +260,6 @@ struct pool {
260 struct dm_deferred_set *all_io_ds; 260 struct dm_deferred_set *all_io_ds;
261 261
262 struct dm_thin_new_mapping *next_mapping; 262 struct dm_thin_new_mapping *next_mapping;
263 mempool_t mapping_pool;
264 263
265 process_bio_fn process_bio; 264 process_bio_fn process_bio;
266 process_bio_fn process_discard; 265 process_bio_fn process_discard;
@@ -273,6 +272,8 @@ struct pool {
273 process_mapping_fn process_prepared_discard_pt2; 272 process_mapping_fn process_prepared_discard_pt2;
274 273
275 struct dm_bio_prison_cell **cell_sort_array; 274 struct dm_bio_prison_cell **cell_sort_array;
275
276 mempool_t mapping_pool;
276}; 277};
277 278
278static enum pool_mode get_pool_mode(struct pool *pool); 279static enum pool_mode get_pool_mode(struct pool *pool);
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
new file mode 100644
index 000000000000..5961c7794ef3
--- /dev/null
+++ b/drivers/md/dm-writecache.c
@@ -0,0 +1,2305 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2018 Red Hat. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
7
8#include <linux/device-mapper.h>
9#include <linux/module.h>
10#include <linux/init.h>
11#include <linux/vmalloc.h>
12#include <linux/kthread.h>
13#include <linux/dm-io.h>
14#include <linux/dm-kcopyd.h>
15#include <linux/dax.h>
16#include <linux/pfn_t.h>
17#include <linux/libnvdimm.h>
18
19#define DM_MSG_PREFIX "writecache"
20
21#define HIGH_WATERMARK 50
22#define LOW_WATERMARK 45
23#define MAX_WRITEBACK_JOBS 0
24#define ENDIO_LATENCY 16
25#define WRITEBACK_LATENCY 64
26#define AUTOCOMMIT_BLOCKS_SSD 65536
27#define AUTOCOMMIT_BLOCKS_PMEM 64
28#define AUTOCOMMIT_MSEC 1000
29
30#define BITMAP_GRANULARITY 65536
31#if BITMAP_GRANULARITY < PAGE_SIZE
32#undef BITMAP_GRANULARITY
33#define BITMAP_GRANULARITY PAGE_SIZE
34#endif
35
36#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_DAX_DRIVER)
37#define DM_WRITECACHE_HAS_PMEM
38#endif
39
40#ifdef DM_WRITECACHE_HAS_PMEM
41#define pmem_assign(dest, src) \
42do { \
43 typeof(dest) uniq = (src); \
44 memcpy_flushcache(&(dest), &uniq, sizeof(dest)); \
45} while (0)
46#else
47#define pmem_assign(dest, src) ((dest) = (src))
48#endif
49
50#if defined(__HAVE_ARCH_MEMCPY_MCSAFE) && defined(DM_WRITECACHE_HAS_PMEM)
51#define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
52#endif
53
54#define MEMORY_SUPERBLOCK_MAGIC 0x23489321
55#define MEMORY_SUPERBLOCK_VERSION 1
56
57struct wc_memory_entry {
58 __le64 original_sector;
59 __le64 seq_count;
60};
61
62struct wc_memory_superblock {
63 union {
64 struct {
65 __le32 magic;
66 __le32 version;
67 __le32 block_size;
68 __le32 pad;
69 __le64 n_blocks;
70 __le64 seq_count;
71 };
72 __le64 padding[8];
73 };
74 struct wc_memory_entry entries[0];
75};
76
77struct wc_entry {
78 struct rb_node rb_node;
79 struct list_head lru;
80 unsigned short wc_list_contiguous;
81 bool write_in_progress
82#if BITS_PER_LONG == 64
83 :1
84#endif
85 ;
86 unsigned long index
87#if BITS_PER_LONG == 64
88 :47
89#endif
90 ;
91#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
92 uint64_t original_sector;
93 uint64_t seq_count;
94#endif
95};
96
97#ifdef DM_WRITECACHE_HAS_PMEM
98#define WC_MODE_PMEM(wc) ((wc)->pmem_mode)
99#define WC_MODE_FUA(wc) ((wc)->writeback_fua)
100#else
101#define WC_MODE_PMEM(wc) false
102#define WC_MODE_FUA(wc) false
103#endif
104#define WC_MODE_SORT_FREELIST(wc) (!WC_MODE_PMEM(wc))
105
106struct dm_writecache {
107 struct mutex lock;
108 struct list_head lru;
109 union {
110 struct list_head freelist;
111 struct {
112 struct rb_root freetree;
113 struct wc_entry *current_free;
114 };
115 };
116 struct rb_root tree;
117
118 size_t freelist_size;
119 size_t writeback_size;
120 size_t freelist_high_watermark;
121 size_t freelist_low_watermark;
122
123 unsigned uncommitted_blocks;
124 unsigned autocommit_blocks;
125 unsigned max_writeback_jobs;
126
127 int error;
128
129 unsigned long autocommit_jiffies;
130 struct timer_list autocommit_timer;
131 struct wait_queue_head freelist_wait;
132
133 atomic_t bio_in_progress[2];
134 struct wait_queue_head bio_in_progress_wait[2];
135
136 struct dm_target *ti;
137 struct dm_dev *dev;
138 struct dm_dev *ssd_dev;
139 void *memory_map;
140 uint64_t memory_map_size;
141 size_t metadata_sectors;
142 size_t n_blocks;
143 uint64_t seq_count;
144 void *block_start;
145 struct wc_entry *entries;
146 unsigned block_size;
147 unsigned char block_size_bits;
148
149 bool pmem_mode:1;
150 bool writeback_fua:1;
151
152 bool overwrote_committed:1;
153 bool memory_vmapped:1;
154
155 bool high_wm_percent_set:1;
156 bool low_wm_percent_set:1;
157 bool max_writeback_jobs_set:1;
158 bool autocommit_blocks_set:1;
159 bool autocommit_time_set:1;
160 bool writeback_fua_set:1;
161 bool flush_on_suspend:1;
162
163 unsigned writeback_all;
164 struct workqueue_struct *writeback_wq;
165 struct work_struct writeback_work;
166 struct work_struct flush_work;
167
168 struct dm_io_client *dm_io;
169
170 raw_spinlock_t endio_list_lock;
171 struct list_head endio_list;
172 struct task_struct *endio_thread;
173
174 struct task_struct *flush_thread;
175 struct bio_list flush_list;
176
177 struct dm_kcopyd_client *dm_kcopyd;
178 unsigned long *dirty_bitmap;
179 unsigned dirty_bitmap_size;
180
181 struct bio_set bio_set;
182 mempool_t copy_pool;
183};
184
185#define WB_LIST_INLINE 16
186
187struct writeback_struct {
188 struct list_head endio_entry;
189 struct dm_writecache *wc;
190 struct wc_entry **wc_list;
191 unsigned wc_list_n;
192 unsigned page_offset;
193 struct page *page;
194 struct wc_entry *wc_list_inline[WB_LIST_INLINE];
195 struct bio bio;
196};
197
198struct copy_struct {
199 struct list_head endio_entry;
200 struct dm_writecache *wc;
201 struct wc_entry *e;
202 unsigned n_entries;
203 int error;
204};
205
206DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
207 "A percentage of time allocated for data copying");
208
209static void wc_lock(struct dm_writecache *wc)
210{
211 mutex_lock(&wc->lock);
212}
213
214static void wc_unlock(struct dm_writecache *wc)
215{
216 mutex_unlock(&wc->lock);
217}
218
219#ifdef DM_WRITECACHE_HAS_PMEM
220static int persistent_memory_claim(struct dm_writecache *wc)
221{
222 int r;
223 loff_t s;
224 long p, da;
225 pfn_t pfn;
226 int id;
227 struct page **pages;
228
229 wc->memory_vmapped = false;
230
231 if (!wc->ssd_dev->dax_dev) {
232 r = -EOPNOTSUPP;
233 goto err1;
234 }
235 s = wc->memory_map_size;
236 p = s >> PAGE_SHIFT;
237 if (!p) {
238 r = -EINVAL;
239 goto err1;
240 }
241 if (p != s >> PAGE_SHIFT) {
242 r = -EOVERFLOW;
243 goto err1;
244 }
245
246 id = dax_read_lock();
247
248 da = dax_direct_access(wc->ssd_dev->dax_dev, 0, p, &wc->memory_map, &pfn);
249 if (da < 0) {
250 wc->memory_map = NULL;
251 r = da;
252 goto err2;
253 }
254 if (!pfn_t_has_page(pfn)) {
255 wc->memory_map = NULL;
256 r = -EOPNOTSUPP;
257 goto err2;
258 }
259 if (da != p) {
260 long i;
261 wc->memory_map = NULL;
262 pages = kvmalloc(p * sizeof(struct page *), GFP_KERNEL);
263 if (!pages) {
264 r = -ENOMEM;
265 goto err2;
266 }
267 i = 0;
268 do {
269 long daa;
270 void *dummy_addr;
271 daa = dax_direct_access(wc->ssd_dev->dax_dev, i, p - i,
272 &dummy_addr, &pfn);
273 if (daa <= 0) {
274 r = daa ? daa : -EINVAL;
275 goto err3;
276 }
277 if (!pfn_t_has_page(pfn)) {
278 r = -EOPNOTSUPP;
279 goto err3;
280 }
281 while (daa-- && i < p) {
282 pages[i++] = pfn_t_to_page(pfn);
283 pfn.val++;
284 }
285 } while (i < p);
286 wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
287 if (!wc->memory_map) {
288 r = -ENOMEM;
289 goto err3;
290 }
291 kvfree(pages);
292 wc->memory_vmapped = true;
293 }
294
295 dax_read_unlock(id);
296 return 0;
297err3:
298 kvfree(pages);
299err2:
300 dax_read_unlock(id);
301err1:
302 return r;
303}
304#else
305static int persistent_memory_claim(struct dm_writecache *wc)
306{
307 BUG();
308}
309#endif
310
311static void persistent_memory_release(struct dm_writecache *wc)
312{
313 if (wc->memory_vmapped)
314 vunmap(wc->memory_map);
315}
316
317static struct page *persistent_memory_page(void *addr)
318{
319 if (is_vmalloc_addr(addr))
320 return vmalloc_to_page(addr);
321 else
322 return virt_to_page(addr);
323}
324
325static unsigned persistent_memory_page_offset(void *addr)
326{
327 return (unsigned long)addr & (PAGE_SIZE - 1);
328}
329
330static void persistent_memory_flush_cache(void *ptr, size_t size)
331{
332 if (is_vmalloc_addr(ptr))
333 flush_kernel_vmap_range(ptr, size);
334}
335
336static void persistent_memory_invalidate_cache(void *ptr, size_t size)
337{
338 if (is_vmalloc_addr(ptr))
339 invalidate_kernel_vmap_range(ptr, size);
340}
341
342static struct wc_memory_superblock *sb(struct dm_writecache *wc)
343{
344 return wc->memory_map;
345}
346
347static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
348{
349 if (is_power_of_2(sizeof(struct wc_entry)) && 0)
350 return &sb(wc)->entries[e - wc->entries];
351 else
352 return &sb(wc)->entries[e->index];
353}
354
355static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
356{
357 return (char *)wc->block_start + (e->index << wc->block_size_bits);
358}
359
360static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
361{
362 return wc->metadata_sectors +
363 ((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
364}
365
366static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
367{
368#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
369 return e->original_sector;
370#else
371 return le64_to_cpu(memory_entry(wc, e)->original_sector);
372#endif
373}
374
375static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
376{
377#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
378 return e->seq_count;
379#else
380 return le64_to_cpu(memory_entry(wc, e)->seq_count);
381#endif
382}
383
384static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
385{
386#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
387 e->seq_count = -1;
388#endif
389 pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
390}
391
392static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
393 uint64_t original_sector, uint64_t seq_count)
394{
395 struct wc_memory_entry me;
396#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
397 e->original_sector = original_sector;
398 e->seq_count = seq_count;
399#endif
400 me.original_sector = cpu_to_le64(original_sector);
401 me.seq_count = cpu_to_le64(seq_count);
402 pmem_assign(*memory_entry(wc, e), me);
403}
404
405#define writecache_error(wc, err, msg, arg...) \
406do { \
407 if (!cmpxchg(&(wc)->error, 0, err)) \
408 DMERR(msg, ##arg); \
409 wake_up(&(wc)->freelist_wait); \
410} while (0)
411
412#define writecache_has_error(wc) (unlikely(READ_ONCE((wc)->error)))
413
414static void writecache_flush_all_metadata(struct dm_writecache *wc)
415{
416 if (!WC_MODE_PMEM(wc))
417 memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
418}
419
420static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
421{
422 if (!WC_MODE_PMEM(wc))
423 __set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
424 wc->dirty_bitmap);
425}
426
427static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);
428
429struct io_notify {
430 struct dm_writecache *wc;
431 struct completion c;
432 atomic_t count;
433};
434
435static void writecache_notify_io(unsigned long error, void *context)
436{
437 struct io_notify *endio = context;
438
439 if (unlikely(error != 0))
440 writecache_error(endio->wc, -EIO, "error writing metadata");
441 BUG_ON(atomic_read(&endio->count) <= 0);
442 if (atomic_dec_and_test(&endio->count))
443 complete(&endio->c);
444}
445
446static void ssd_commit_flushed(struct dm_writecache *wc)
447{
448 struct dm_io_region region;
449 struct dm_io_request req;
450 struct io_notify endio = {
451 wc,
452 COMPLETION_INITIALIZER_ONSTACK(endio.c),
453 ATOMIC_INIT(1),
454 };
455 unsigned bitmap_bits = wc->dirty_bitmap_size * BITS_PER_LONG;
456 unsigned i = 0;
457
458 while (1) {
459 unsigned j;
460 i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
461 if (unlikely(i == bitmap_bits))
462 break;
463 j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);
464
465 region.bdev = wc->ssd_dev->bdev;
466 region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
467 region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
468
469 if (unlikely(region.sector >= wc->metadata_sectors))
470 break;
471 if (unlikely(region.sector + region.count > wc->metadata_sectors))
472 region.count = wc->metadata_sectors - region.sector;
473
474 atomic_inc(&endio.count);
475 req.bi_op = REQ_OP_WRITE;
476 req.bi_op_flags = REQ_SYNC;
477 req.mem.type = DM_IO_VMA;
478 req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
479 req.client = wc->dm_io;
480 req.notify.fn = writecache_notify_io;
481 req.notify.context = &endio;
482
483 /* writing via async dm-io (implied by notify.fn above) won't return an error */
484 (void) dm_io(&req, 1, &region, NULL);
485 i = j;
486 }
487
488 writecache_notify_io(0, &endio);
489 wait_for_completion_io(&endio.c);
490
491 writecache_disk_flush(wc, wc->ssd_dev);
492
493 memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
494}
495
496static void writecache_commit_flushed(struct dm_writecache *wc)
497{
498 if (WC_MODE_PMEM(wc))
499 wmb();
500 else
501 ssd_commit_flushed(wc);
502}
503
504static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
505{
506 int r;
507 struct dm_io_region region;
508 struct dm_io_request req;
509
510 region.bdev = dev->bdev;
511 region.sector = 0;
512 region.count = 0;
513 req.bi_op = REQ_OP_WRITE;
514 req.bi_op_flags = REQ_PREFLUSH;
515 req.mem.type = DM_IO_KMEM;
516 req.mem.ptr.addr = NULL;
517 req.client = wc->dm_io;
518 req.notify.fn = NULL;
519
520 r = dm_io(&req, 1, &region, NULL);
521 if (unlikely(r))
522 writecache_error(wc, r, "error flushing metadata: %d", r);
523}
524
525static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
526{
527 wait_event(wc->bio_in_progress_wait[direction],
528 !atomic_read(&wc->bio_in_progress[direction]));
529}
530
531#define WFE_RETURN_FOLLOWING 1
532#define WFE_LOWEST_SEQ 2
533
534static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
535 uint64_t block, int flags)
536{
537 struct wc_entry *e;
538 struct rb_node *node = wc->tree.rb_node;
539
540 if (unlikely(!node))
541 return NULL;
542
543 while (1) {
544 e = container_of(node, struct wc_entry, rb_node);
545 if (read_original_sector(wc, e) == block)
546 break;
547 node = (read_original_sector(wc, e) >= block ?
548 e->rb_node.rb_left : e->rb_node.rb_right);
549 if (unlikely(!node)) {
550 if (!(flags & WFE_RETURN_FOLLOWING)) {
551 return NULL;
552 }
553 if (read_original_sector(wc, e) >= block) {
554 break;
555 } else {
556 node = rb_next(&e->rb_node);
557 if (unlikely(!node)) {
558 return NULL;
559 }
560 e = container_of(node, struct wc_entry, rb_node);
561 break;
562 }
563 }
564 }
565
566 while (1) {
567 struct wc_entry *e2;
568 if (flags & WFE_LOWEST_SEQ)
569 node = rb_prev(&e->rb_node);
570 else
571 node = rb_next(&e->rb_node);
572 if (!node)
573 return e;
574 e2 = container_of(node, struct wc_entry, rb_node);
575 if (read_original_sector(wc, e2) != block)
576 return e;
577 e = e2;
578 }
579}
580
581static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
582{
583 struct wc_entry *e;
584 struct rb_node **node = &wc->tree.rb_node, *parent = NULL;
585
586 while (*node) {
587 e = container_of(*node, struct wc_entry, rb_node);
588 parent = &e->rb_node;
589 if (read_original_sector(wc, e) > read_original_sector(wc, ins))
590 node = &parent->rb_left;
591 else
592 node = &parent->rb_right;
593 }
594 rb_link_node(&ins->rb_node, parent, node);
595 rb_insert_color(&ins->rb_node, &wc->tree);
596 list_add(&ins->lru, &wc->lru);
597}
598
599static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
600{
601 list_del(&e->lru);
602 rb_erase(&e->rb_node, &wc->tree);
603}
604
605static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
606{
607 if (WC_MODE_SORT_FREELIST(wc)) {
608 struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;
609 if (unlikely(!*node))
610 wc->current_free = e;
611 while (*node) {
612 parent = *node;
613 if (&e->rb_node < *node)
614 node = &parent->rb_left;
615 else
616 node = &parent->rb_right;
617 }
618 rb_link_node(&e->rb_node, parent, node);
619 rb_insert_color(&e->rb_node, &wc->freetree);
620 } else {
621 list_add_tail(&e->lru, &wc->freelist);
622 }
623 wc->freelist_size++;
624}
625
626static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
627{
628 struct wc_entry *e;
629
630 if (WC_MODE_SORT_FREELIST(wc)) {
631 struct rb_node *next;
632 if (unlikely(!wc->current_free))
633 return NULL;
634 e = wc->current_free;
635 next = rb_next(&e->rb_node);
636 rb_erase(&e->rb_node, &wc->freetree);
637 if (unlikely(!next))
638 next = rb_first(&wc->freetree);
639 wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
640 } else {
641 if (unlikely(list_empty(&wc->freelist)))
642 return NULL;
643 e = container_of(wc->freelist.next, struct wc_entry, lru);
644 list_del(&e->lru);
645 }
646 wc->freelist_size--;
647 if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
648 queue_work(wc->writeback_wq, &wc->writeback_work);
649
650 return e;
651}
652
653static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
654{
655 writecache_unlink(wc, e);
656 writecache_add_to_freelist(wc, e);
657 clear_seq_count(wc, e);
658 writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
659 if (unlikely(waitqueue_active(&wc->freelist_wait)))
660 wake_up(&wc->freelist_wait);
661}
662
663static void writecache_wait_on_freelist(struct dm_writecache *wc)
664{
665 DEFINE_WAIT(wait);
666
667 prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
668 wc_unlock(wc);
669 io_schedule();
670 finish_wait(&wc->freelist_wait, &wait);
671 wc_lock(wc);
672}
673
674static void writecache_poison_lists(struct dm_writecache *wc)
675{
676 /*
677 * Catch incorrect access to these values while the device is suspended.
678 */
679 memset(&wc->tree, -1, sizeof wc->tree);
680 wc->lru.next = LIST_POISON1;
681 wc->lru.prev = LIST_POISON2;
682 wc->freelist.next = LIST_POISON1;
683 wc->freelist.prev = LIST_POISON2;
684}
685
686static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
687{
688 writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
689 if (WC_MODE_PMEM(wc))
690 writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
691}
692
693static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
694{
695 return read_seq_count(wc, e) < wc->seq_count;
696}
697
698static void writecache_flush(struct dm_writecache *wc)
699{
700 struct wc_entry *e, *e2;
701 bool need_flush_after_free;
702
703 wc->uncommitted_blocks = 0;
704 del_timer(&wc->autocommit_timer);
705
706 if (list_empty(&wc->lru))
707 return;
708
709 e = container_of(wc->lru.next, struct wc_entry, lru);
710 if (writecache_entry_is_committed(wc, e)) {
711 if (wc->overwrote_committed) {
712 writecache_wait_for_ios(wc, WRITE);
713 writecache_disk_flush(wc, wc->ssd_dev);
714 wc->overwrote_committed = false;
715 }
716 return;
717 }
718 while (1) {
719 writecache_flush_entry(wc, e);
720 if (unlikely(e->lru.next == &wc->lru))
721 break;
722 e2 = container_of(e->lru.next, struct wc_entry, lru);
723 if (writecache_entry_is_committed(wc, e2))
724 break;
725 e = e2;
726 cond_resched();
727 }
728 writecache_commit_flushed(wc);
729
730 writecache_wait_for_ios(wc, WRITE);
731
732 wc->seq_count++;
733 pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
734 writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
735 writecache_commit_flushed(wc);
736
737 wc->overwrote_committed = false;
738
739 need_flush_after_free = false;
740 while (1) {
741 /* Free another committed entry with lower seq-count */
742 struct rb_node *rb_node = rb_prev(&e->rb_node);
743
744 if (rb_node) {
745 e2 = container_of(rb_node, struct wc_entry, rb_node);
746 if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
747 likely(!e2->write_in_progress)) {
748 writecache_free_entry(wc, e2);
749 need_flush_after_free = true;
750 }
751 }
752 if (unlikely(e->lru.prev == &wc->lru))
753 break;
754 e = container_of(e->lru.prev, struct wc_entry, lru);
755 cond_resched();
756 }
757
758 if (need_flush_after_free)
759 writecache_commit_flushed(wc);
760}
761
762static void writecache_flush_work(struct work_struct *work)
763{
764 struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);
765
766 wc_lock(wc);
767 writecache_flush(wc);
768 wc_unlock(wc);
769}
770
771static void writecache_autocommit_timer(struct timer_list *t)
772{
773 struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);
774 if (!writecache_has_error(wc))
775 queue_work(wc->writeback_wq, &wc->flush_work);
776}
777
778static void writecache_schedule_autocommit(struct dm_writecache *wc)
779{
780 if (!timer_pending(&wc->autocommit_timer))
781 mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
782}
783
784static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
785{
786 struct wc_entry *e;
787 bool discarded_something = false;
788
789 e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
790 if (unlikely(!e))
791 return;
792
793 while (read_original_sector(wc, e) < end) {
794 struct rb_node *node = rb_next(&e->rb_node);
795
796 if (likely(!e->write_in_progress)) {
797 if (!discarded_something) {
798 writecache_wait_for_ios(wc, READ);
799 writecache_wait_for_ios(wc, WRITE);
800 discarded_something = true;
801 }
802 writecache_free_entry(wc, e);
803 }
804
805 if (!node)
806 break;
807
808 e = container_of(node, struct wc_entry, rb_node);
809 }
810
811 if (discarded_something)
812 writecache_commit_flushed(wc);
813}
814
815static bool writecache_wait_for_writeback(struct dm_writecache *wc)
816{
817 if (wc->writeback_size) {
818 writecache_wait_on_freelist(wc);
819 return true;
820 }
821 return false;
822}
823
824static void writecache_suspend(struct dm_target *ti)
825{
826 struct dm_writecache *wc = ti->private;
827 bool flush_on_suspend;
828
829 del_timer_sync(&wc->autocommit_timer);
830
831 wc_lock(wc);
832 writecache_flush(wc);
833 flush_on_suspend = wc->flush_on_suspend;
834 if (flush_on_suspend) {
835 wc->flush_on_suspend = false;
836 wc->writeback_all++;
837 queue_work(wc->writeback_wq, &wc->writeback_work);
838 }
839 wc_unlock(wc);
840
841 flush_workqueue(wc->writeback_wq);
842
843 wc_lock(wc);
844 if (flush_on_suspend)
845 wc->writeback_all--;
846 while (writecache_wait_for_writeback(wc));
847
848 if (WC_MODE_PMEM(wc))
849 persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
850
851 writecache_poison_lists(wc);
852
853 wc_unlock(wc);
854}
855
856static int writecache_alloc_entries(struct dm_writecache *wc)
857{
858 size_t b;
859
860 if (wc->entries)
861 return 0;
862 wc->entries = vmalloc(sizeof(struct wc_entry) * wc->n_blocks);
863 if (!wc->entries)
864 return -ENOMEM;
865 for (b = 0; b < wc->n_blocks; b++) {
866 struct wc_entry *e = &wc->entries[b];
867 e->index = b;
868 e->write_in_progress = false;
869 }
870
871 return 0;
872}
873
874static void writecache_resume(struct dm_target *ti)
875{
876 struct dm_writecache *wc = ti->private;
877 size_t b;
878 bool need_flush = false;
879 __le64 sb_seq_count;
880 int r;
881
882 wc_lock(wc);
883
884 if (WC_MODE_PMEM(wc))
885 persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
886
887 wc->tree = RB_ROOT;
888 INIT_LIST_HEAD(&wc->lru);
889 if (WC_MODE_SORT_FREELIST(wc)) {
890 wc->freetree = RB_ROOT;
891 wc->current_free = NULL;
892 } else {
893 INIT_LIST_HEAD(&wc->freelist);
894 }
895 wc->freelist_size = 0;
896
897 r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t));
898 if (r) {
899 writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
900 sb_seq_count = cpu_to_le64(0);
901 }
902 wc->seq_count = le64_to_cpu(sb_seq_count);
903
904#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
905 for (b = 0; b < wc->n_blocks; b++) {
906 struct wc_entry *e = &wc->entries[b];
907 struct wc_memory_entry wme;
908 if (writecache_has_error(wc)) {
909 e->original_sector = -1;
910 e->seq_count = -1;
911 continue;
912 }
913 r = memcpy_mcsafe(&wme, memory_entry(wc, e), sizeof(struct wc_memory_entry));
914 if (r) {
915 writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
916 (unsigned long)b, r);
917 e->original_sector = -1;
918 e->seq_count = -1;
919 } else {
920 e->original_sector = le64_to_cpu(wme.original_sector);
921 e->seq_count = le64_to_cpu(wme.seq_count);
922 }
923 }
924#endif
925 for (b = 0; b < wc->n_blocks; b++) {
926 struct wc_entry *e = &wc->entries[b];
927 if (!writecache_entry_is_committed(wc, e)) {
928 if (read_seq_count(wc, e) != -1) {
929erase_this:
930 clear_seq_count(wc, e);
931 need_flush = true;
932 }
933 writecache_add_to_freelist(wc, e);
934 } else {
935 struct wc_entry *old;
936
937 old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
938 if (!old) {
939 writecache_insert_entry(wc, e);
940 } else {
941 if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
942 writecache_error(wc, -EINVAL,
943 "two identical entries, position %llu, sector %llu, sequence %llu",
944 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
945 (unsigned long long)read_seq_count(wc, e));
946 }
947 if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
948 goto erase_this;
949 } else {
950 writecache_free_entry(wc, old);
951 writecache_insert_entry(wc, e);
952 need_flush = true;
953 }
954 }
955 }
956 cond_resched();
957 }
958
959 if (need_flush) {
960 writecache_flush_all_metadata(wc);
961 writecache_commit_flushed(wc);
962 }
963
964 wc_unlock(wc);
965}
966
967static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
968{
969 if (argc != 1)
970 return -EINVAL;
971
972 wc_lock(wc);
973 if (dm_suspended(wc->ti)) {
974 wc_unlock(wc);
975 return -EBUSY;
976 }
977 if (writecache_has_error(wc)) {
978 wc_unlock(wc);
979 return -EIO;
980 }
981
982 writecache_flush(wc);
983 wc->writeback_all++;
984 queue_work(wc->writeback_wq, &wc->writeback_work);
985 wc_unlock(wc);
986
987 flush_workqueue(wc->writeback_wq);
988
989 wc_lock(wc);
990 wc->writeback_all--;
991 if (writecache_has_error(wc)) {
992 wc_unlock(wc);
993 return -EIO;
994 }
995 wc_unlock(wc);
996
997 return 0;
998}
999
1000static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1001{
1002 if (argc != 1)
1003 return -EINVAL;
1004
1005 wc_lock(wc);
1006 wc->flush_on_suspend = true;
1007 wc_unlock(wc);
1008
1009 return 0;
1010}
1011
1012static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
1013 char *result, unsigned maxlen)
1014{
1015 int r = -EINVAL;
1016 struct dm_writecache *wc = ti->private;
1017
1018 if (!strcasecmp(argv[0], "flush"))
1019 r = process_flush_mesg(argc, argv, wc);
1020 else if (!strcasecmp(argv[0], "flush_on_suspend"))
1021 r = process_flush_on_suspend_mesg(argc, argv, wc);
1022 else
1023 DMERR("unrecognised message received: %s", argv[0]);
1024
1025 return r;
1026}
1027
1028static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
1029{
1030 void *buf;
1031 unsigned long flags;
1032 unsigned size;
1033 int rw = bio_data_dir(bio);
1034 unsigned remaining_size = wc->block_size;
1035
1036 do {
1037 struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
1038 buf = bvec_kmap_irq(&bv, &flags);
1039 size = bv.bv_len;
1040 if (unlikely(size > remaining_size))
1041 size = remaining_size;
1042
1043 if (rw == READ) {
1044 int r;
1045 r = memcpy_mcsafe(buf, data, size);
1046 flush_dcache_page(bio_page(bio));
1047 if (unlikely(r)) {
1048 writecache_error(wc, r, "hardware memory error when reading data: %d", r);
1049 bio->bi_status = BLK_STS_IOERR;
1050 }
1051 } else {
1052 flush_dcache_page(bio_page(bio));
1053 memcpy_flushcache(data, buf, size);
1054 }
1055
1056 bvec_kunmap_irq(buf, &flags);
1057
1058 data = (char *)data + size;
1059 remaining_size -= size;
1060 bio_advance(bio, size);
1061 } while (unlikely(remaining_size));
1062}
1063
1064static int writecache_flush_thread(void *data)
1065{
1066 struct dm_writecache *wc = data;
1067
1068 while (1) {
1069 struct bio *bio;
1070
1071 wc_lock(wc);
1072 bio = bio_list_pop(&wc->flush_list);
1073 if (!bio) {
1074 set_current_state(TASK_INTERRUPTIBLE);
1075 wc_unlock(wc);
1076
1077 if (unlikely(kthread_should_stop())) {
1078 set_current_state(TASK_RUNNING);
1079 break;
1080 }
1081
1082 schedule();
1083 continue;
1084 }
1085
1086 if (bio_op(bio) == REQ_OP_DISCARD) {
1087 writecache_discard(wc, bio->bi_iter.bi_sector,
1088 bio_end_sector(bio));
1089 wc_unlock(wc);
1090 bio_set_dev(bio, wc->dev->bdev);
1091 generic_make_request(bio);
1092 } else {
1093 writecache_flush(wc);
1094 wc_unlock(wc);
1095 if (writecache_has_error(wc))
1096 bio->bi_status = BLK_STS_IOERR;
1097 bio_endio(bio);
1098 }
1099 }
1100
1101 return 0;
1102}
1103
1104static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
1105{
1106 if (bio_list_empty(&wc->flush_list))
1107 wake_up_process(wc->flush_thread);
1108 bio_list_add(&wc->flush_list, bio);
1109}
1110
1111static int writecache_map(struct dm_target *ti, struct bio *bio)
1112{
1113 struct wc_entry *e;
1114 struct dm_writecache *wc = ti->private;
1115
1116 bio->bi_private = NULL;
1117
1118 wc_lock(wc);
1119
1120 if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1121 if (writecache_has_error(wc))
1122 goto unlock_error;
1123 if (WC_MODE_PMEM(wc)) {
1124 writecache_flush(wc);
1125 if (writecache_has_error(wc))
1126 goto unlock_error;
1127 goto unlock_submit;
1128 } else {
1129 writecache_offload_bio(wc, bio);
1130 goto unlock_return;
1131 }
1132 }
1133
1134 bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
1135
1136 if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
1137 (wc->block_size / 512 - 1)) != 0)) {
1138 DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
1139 (unsigned long long)bio->bi_iter.bi_sector,
1140 bio->bi_iter.bi_size, wc->block_size);
1141 goto unlock_error;
1142 }
1143
1144 if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
1145 if (writecache_has_error(wc))
1146 goto unlock_error;
1147 if (WC_MODE_PMEM(wc)) {
1148 writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
1149 goto unlock_remap_origin;
1150 } else {
1151 writecache_offload_bio(wc, bio);
1152 goto unlock_return;
1153 }
1154 }
1155
1156 if (bio_data_dir(bio) == READ) {
1157read_next_block:
1158 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
1159 if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
1160 if (WC_MODE_PMEM(wc)) {
1161 bio_copy_block(wc, bio, memory_data(wc, e));
1162 if (bio->bi_iter.bi_size)
1163 goto read_next_block;
1164 goto unlock_submit;
1165 } else {
1166 dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
1167 bio_set_dev(bio, wc->ssd_dev->bdev);
1168 bio->bi_iter.bi_sector = cache_sector(wc, e);
1169 if (!writecache_entry_is_committed(wc, e))
1170 writecache_wait_for_ios(wc, WRITE);
1171 goto unlock_remap;
1172 }
1173 } else {
1174 if (e) {
1175 sector_t next_boundary =
1176 read_original_sector(wc, e) - bio->bi_iter.bi_sector;
1177 if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
1178 dm_accept_partial_bio(bio, next_boundary);
1179 }
1180 }
1181 goto unlock_remap_origin;
1182 }
1183 } else {
1184 do {
1185 if (writecache_has_error(wc))
1186 goto unlock_error;
1187 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
1188 if (e) {
1189 if (!writecache_entry_is_committed(wc, e))
1190 goto bio_copy;
1191 if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
1192 wc->overwrote_committed = true;
1193 goto bio_copy;
1194 }
1195 }
1196 e = writecache_pop_from_freelist(wc);
1197 if (unlikely(!e)) {
1198 writecache_wait_on_freelist(wc);
1199 continue;
1200 }
1201 write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
1202 writecache_insert_entry(wc, e);
1203 wc->uncommitted_blocks++;
1204bio_copy:
1205 if (WC_MODE_PMEM(wc)) {
1206 bio_copy_block(wc, bio, memory_data(wc, e));
1207 } else {
1208 dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
1209 bio_set_dev(bio, wc->ssd_dev->bdev);
1210 bio->bi_iter.bi_sector = cache_sector(wc, e);
1211 if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
1212 wc->uncommitted_blocks = 0;
1213 queue_work(wc->writeback_wq, &wc->flush_work);
1214 } else {
1215 writecache_schedule_autocommit(wc);
1216 }
1217 goto unlock_remap;
1218 }
1219 } while (bio->bi_iter.bi_size);
1220
1221 if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks))
1222 writecache_flush(wc);
1223 else
1224 writecache_schedule_autocommit(wc);
1225 goto unlock_submit;
1226 }
1227
1228unlock_remap_origin:
1229 bio_set_dev(bio, wc->dev->bdev);
1230 wc_unlock(wc);
1231 return DM_MAPIO_REMAPPED;
1232
1233unlock_remap:
1234 /* make sure that writecache_end_io decrements bio_in_progress: */
1235 bio->bi_private = (void *)1;
1236 atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
1237 wc_unlock(wc);
1238 return DM_MAPIO_REMAPPED;
1239
1240unlock_submit:
1241 wc_unlock(wc);
1242 bio_endio(bio);
1243 return DM_MAPIO_SUBMITTED;
1244
1245unlock_return:
1246 wc_unlock(wc);
1247 return DM_MAPIO_SUBMITTED;
1248
1249unlock_error:
1250 wc_unlock(wc);
1251 bio_io_error(bio);
1252 return DM_MAPIO_SUBMITTED;
1253}
1254
1255static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
1256{
1257 struct dm_writecache *wc = ti->private;
1258
1259 if (bio->bi_private != NULL) {
1260 int dir = bio_data_dir(bio);
1261 if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
1262 if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
1263 wake_up(&wc->bio_in_progress_wait[dir]);
1264 }
1265 return 0;
1266}
1267
1268static int writecache_iterate_devices(struct dm_target *ti,
1269 iterate_devices_callout_fn fn, void *data)
1270{
1271 struct dm_writecache *wc = ti->private;
1272
1273 return fn(ti, wc->dev, 0, ti->len, data);
1274}
1275
1276static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
1277{
1278 struct dm_writecache *wc = ti->private;
1279
1280 if (limits->logical_block_size < wc->block_size)
1281 limits->logical_block_size = wc->block_size;
1282
1283 if (limits->physical_block_size < wc->block_size)
1284 limits->physical_block_size = wc->block_size;
1285
1286 if (limits->io_min < wc->block_size)
1287 limits->io_min = wc->block_size;
1288}
1289
1290
1291static void writecache_writeback_endio(struct bio *bio)
1292{
1293 struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
1294 struct dm_writecache *wc = wb->wc;
1295 unsigned long flags;
1296
1297 raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
1298 if (unlikely(list_empty(&wc->endio_list)))
1299 wake_up_process(wc->endio_thread);
1300 list_add_tail(&wb->endio_entry, &wc->endio_list);
1301 raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
1302}
1303
1304static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
1305{
1306 struct copy_struct *c = ptr;
1307 struct dm_writecache *wc = c->wc;
1308
1309 c->error = likely(!(read_err | write_err)) ? 0 : -EIO;
1310
1311 raw_spin_lock_irq(&wc->endio_list_lock);
1312 if (unlikely(list_empty(&wc->endio_list)))
1313 wake_up_process(wc->endio_thread);
1314 list_add_tail(&c->endio_entry, &wc->endio_list);
1315 raw_spin_unlock_irq(&wc->endio_list_lock);
1316}
1317
1318static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
1319{
1320 unsigned i;
1321 struct writeback_struct *wb;
1322 struct wc_entry *e;
1323 unsigned long n_walked = 0;
1324
1325 do {
1326 wb = list_entry(list->next, struct writeback_struct, endio_entry);
1327 list_del(&wb->endio_entry);
1328
1329 if (unlikely(wb->bio.bi_status != BLK_STS_OK))
1330 writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
1331 "write error %d", wb->bio.bi_status);
1332 i = 0;
1333 do {
1334 e = wb->wc_list[i];
1335 BUG_ON(!e->write_in_progress);
1336 e->write_in_progress = false;
1337 INIT_LIST_HEAD(&e->lru);
1338 if (!writecache_has_error(wc))
1339 writecache_free_entry(wc, e);
1340 BUG_ON(!wc->writeback_size);
1341 wc->writeback_size--;
1342 n_walked++;
1343 if (unlikely(n_walked >= ENDIO_LATENCY)) {
1344 writecache_commit_flushed(wc);
1345 wc_unlock(wc);
1346 wc_lock(wc);
1347 n_walked = 0;
1348 }
1349 } while (++i < wb->wc_list_n);
1350
1351 if (wb->wc_list != wb->wc_list_inline)
1352 kfree(wb->wc_list);
1353 bio_put(&wb->bio);
1354 } while (!list_empty(list));
1355}
1356
1357static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
1358{
1359 struct copy_struct *c;
1360 struct wc_entry *e;
1361
1362 do {
1363 c = list_entry(list->next, struct copy_struct, endio_entry);
1364 list_del(&c->endio_entry);
1365
1366 if (unlikely(c->error))
1367 writecache_error(wc, c->error, "copy error");
1368
1369 e = c->e;
1370 do {
1371 BUG_ON(!e->write_in_progress);
1372 e->write_in_progress = false;
1373 INIT_LIST_HEAD(&e->lru);
1374 if (!writecache_has_error(wc))
1375 writecache_free_entry(wc, e);
1376
1377 BUG_ON(!wc->writeback_size);
1378 wc->writeback_size--;
1379 e++;
1380 } while (--c->n_entries);
1381 mempool_free(c, &wc->copy_pool);
1382 } while (!list_empty(list));
1383}
1384
1385static int writecache_endio_thread(void *data)
1386{
1387 struct dm_writecache *wc = data;
1388
1389 while (1) {
1390 struct list_head list;
1391
1392 raw_spin_lock_irq(&wc->endio_list_lock);
1393 if (!list_empty(&wc->endio_list))
1394 goto pop_from_list;
1395 set_current_state(TASK_INTERRUPTIBLE);
1396 raw_spin_unlock_irq(&wc->endio_list_lock);
1397
1398 if (unlikely(kthread_should_stop())) {
1399 set_current_state(TASK_RUNNING);
1400 break;
1401 }
1402
1403 schedule();
1404
1405 continue;
1406
1407pop_from_list:
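/*
 * Splice the whole endio list onto an on-stack head: copy the head,
 * repoint the first and last entries at the local copy, then
 * reinitialize the shared head for new completions.
 */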
1408 list = wc->endio_list;
1409 list.next->prev = list.prev->next = &list;
1410 INIT_LIST_HEAD(&wc->endio_list);
1411 raw_spin_unlock_irq(&wc->endio_list_lock);
1412
1413 if (!WC_MODE_FUA(wc))
1414 writecache_disk_flush(wc, wc->dev);
1415
1416 wc_lock(wc);
1417
1418 if (WC_MODE_PMEM(wc)) {
1419 __writecache_endio_pmem(wc, &list);
1420 } else {
1421 __writecache_endio_ssd(wc, &list);
1422 writecache_wait_for_ios(wc, READ);
1423 }
1424
1425 writecache_commit_flushed(wc);
1426
1427 wc_unlock(wc);
1428 }
1429
1430 return 0;
1431}
1432
1433static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp)
1434{
1435 struct dm_writecache *wc = wb->wc;
1436 unsigned block_size = wc->block_size;
1437 void *address = memory_data(wc, e);
1438
1439 persistent_memory_flush_cache(address, block_size);
1440 return bio_add_page(&wb->bio, persistent_memory_page(address),
1441 block_size, persistent_memory_page_offset(address)) != 0;
1442}
1443
1444struct writeback_list {
1445 struct list_head list;
1446 size_t size;
1447};
1448
1449static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
1450{
1451 if (unlikely(wc->max_writeback_jobs)) {
1452 if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
1453 wc_lock(wc);
1454 while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
1455 writecache_wait_on_freelist(wc);
1456 wc_unlock(wc);
1457 }
1458 }
1459 cond_resched();
1460}
1461
1462static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
1463{
1464 struct wc_entry *e, *f;
1465 struct bio *bio;
1466 struct writeback_struct *wb;
1467 unsigned max_pages;
1468
1469 while (wbl->size) {
1470 wbl->size--;
1471 e = container_of(wbl->list.prev, struct wc_entry, lru);
1472 list_del(&e->lru);
1473
1474 max_pages = e->wc_list_contiguous;
1475
1476 bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
1477 wb = container_of(bio, struct writeback_struct, bio);
1478 wb->wc = wc;
1479 wb->bio.bi_end_io = writecache_writeback_endio;
1480 bio_set_dev(&wb->bio, wc->dev->bdev);
1481 wb->bio.bi_iter.bi_sector = read_original_sector(wc, e);
1482 wb->page_offset = PAGE_SIZE;
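/*
 * Allocate an array for the coalesced entries; fall back to the small
 * inline array if the run is short or the allocation fails.
 */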
1483 if (max_pages <= WB_LIST_INLINE ||
1484 unlikely(!(wb->wc_list = kmalloc(max_pages * sizeof(struct wc_entry *),
1485 GFP_NOIO | __GFP_NORETRY |
1486 __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
1487 wb->wc_list = wb->wc_list_inline;
1488 max_pages = WB_LIST_INLINE;
1489 }
1490
1491 BUG_ON(!wc_add_block(wb, e, GFP_NOIO));
1492
1493 wb->wc_list[0] = e;
1494 wb->wc_list_n = 1;
1495
1496 while (wbl->size && wb->wc_list_n < max_pages) {
1497 f = container_of(wbl->list.prev, struct wc_entry, lru);
1498 if (read_original_sector(wc, f) !=
1499 read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
1500 break;
1501 if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN))
1502 break;
1503 wbl->size--;
1504 list_del(&f->lru);
1505 wb->wc_list[wb->wc_list_n++] = f;
1506 e = f;
1507 }
1508 bio_set_op_attrs(&wb->bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
1509 if (writecache_has_error(wc)) {
1510 bio->bi_status = BLK_STS_IOERR;
1511 bio_endio(&wb->bio);
1512 } else {
1513 submit_bio(&wb->bio);
1514 }
1515
1516 __writeback_throttle(wc, wbl);
1517 }
1518}
1519
1520static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
1521{
1522 struct wc_entry *e, *f;
1523 struct dm_io_region from, to;
1524 struct copy_struct *c;
1525
1526 while (wbl->size) {
1527 unsigned n_sectors;
1528
1529 wbl->size--;
1530 e = container_of(wbl->list.prev, struct wc_entry, lru);
1531 list_del(&e->lru);
1532
1533 n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);
1534
1535 from.bdev = wc->ssd_dev->bdev;
1536 from.sector = cache_sector(wc, e);
1537 from.count = n_sectors;
1538 to.bdev = wc->dev->bdev;
1539 to.sector = read_original_sector(wc, e);
1540 to.count = n_sectors;
1541
1542 c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
1543 c->wc = wc;
1544 c->e = e;
1545 c->n_entries = e->wc_list_contiguous;
1546
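/*
 * The coalesced entries are physically contiguous on the cache device
 * (each f is e + 1), so drop them from the writeback list here; the
 * single kcopyd job below writes them all back.
 */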
1547 while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
1548 wbl->size--;
1549 f = container_of(wbl->list.prev, struct wc_entry, lru);
1550 BUG_ON(f != e + 1);
1551 list_del(&f->lru);
1552 e = f;
1553 }
1554
1555 dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
1556
1557 __writeback_throttle(wc, wbl);
1558 }
1559}
1560
1561static void writecache_writeback(struct work_struct *work)
1562{
1563 struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
1564 struct blk_plug plug;
1565 struct wc_entry *e, *f, *g;
1566 struct rb_node *node, *next_node;
1567 struct list_head skipped;
1568 struct writeback_list wbl;
1569 unsigned long n_walked;
1570
1571 wc_lock(wc);
1572restart:
1573 if (writecache_has_error(wc)) {
1574 wc_unlock(wc);
1575 return;
1576 }
1577
1578 if (unlikely(wc->writeback_all)) {
1579 if (writecache_wait_for_writeback(wc))
1580 goto restart;
1581 }
1582
1583 if (wc->overwrote_committed) {
1584 writecache_wait_for_ios(wc, WRITE);
1585 }
1586
1587 n_walked = 0;
1588 INIT_LIST_HEAD(&skipped);
1589 INIT_LIST_HEAD(&wbl.list);
1590 wbl.size = 0;
1591 while (!list_empty(&wc->lru) &&
1592 (wc->writeback_all ||
1593 wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark)) {
1594
1595 n_walked++;
1596 if (unlikely(n_walked > WRITEBACK_LATENCY) &&
1597 likely(!wc->writeback_all) && likely(!dm_suspended(wc->ti))) {
1598 queue_work(wc->writeback_wq, &wc->writeback_work);
1599 break;
1600 }
1601
1602 e = container_of(wc->lru.prev, struct wc_entry, lru);
1603 BUG_ON(e->write_in_progress);
1604 if (unlikely(!writecache_entry_is_committed(wc, e))) {
1605 writecache_flush(wc);
1606 }
1607 node = rb_prev(&e->rb_node);
1608 if (node) {
1609 f = container_of(node, struct wc_entry, rb_node);
1610 if (unlikely(read_original_sector(wc, f) ==
1611 read_original_sector(wc, e))) {
1612 BUG_ON(!f->write_in_progress);
1613 list_del(&e->lru);
1614 list_add(&e->lru, &skipped);
1615 cond_resched();
1616 continue;
1617 }
1618 }
1619 wc->writeback_size++;
1620 list_del(&e->lru);
1621 list_add(&e->lru, &wbl.list);
1622 wbl.size++;
1623 e->write_in_progress = true;
1624 e->wc_list_contiguous = 1;
1625
1626 f = e;
1627
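/*
 * Extend the writeback run with committed entries whose original
 * sectors directly follow this one (and that are also adjacent on the
 * cache device in SSD mode), up to BIO_MAX_PAGES entries.
 */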
1628 while (1) {
1629 next_node = rb_next(&f->rb_node);
1630 if (unlikely(!next_node))
1631 break;
1632 g = container_of(next_node, struct wc_entry, rb_node);
1633 if (read_original_sector(wc, g) ==
1634 read_original_sector(wc, f)) {
1635 f = g;
1636 continue;
1637 }
1638 if (read_original_sector(wc, g) !=
1639 read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
1640 break;
1641 if (unlikely(g->write_in_progress))
1642 break;
1643 if (unlikely(!writecache_entry_is_committed(wc, g)))
1644 break;
1645
1646 if (!WC_MODE_PMEM(wc)) {
1647 if (g != f + 1)
1648 break;
1649 }
1650
1651 n_walked++;
1652 //if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
1653 // break;
1654
1655 wc->writeback_size++;
1656 list_del(&g->lru);
1657 list_add(&g->lru, &wbl.list);
1658 wbl.size++;
1659 g->write_in_progress = true;
1660 g->wc_list_contiguous = BIO_MAX_PAGES;
1661 f = g;
1662 e->wc_list_contiguous++;
1663 if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES))
1664 break;
1665 }
1666 cond_resched();
1667 }
1668
1669 if (!list_empty(&skipped)) {
1670 list_splice_tail(&skipped, &wc->lru);
1671 /*
1672 * If we didn't make any progress, we must wait until some
1673 * writeback finishes to avoid burning CPU in a loop
1674 */
1675 if (unlikely(!wbl.size))
1676 writecache_wait_for_writeback(wc);
1677 }
1678
1679 wc_unlock(wc);
1680
1681 blk_start_plug(&plug);
1682
1683 if (WC_MODE_PMEM(wc))
1684 __writecache_writeback_pmem(wc, &wbl);
1685 else
1686 __writecache_writeback_ssd(wc, &wbl);
1687
1688 blk_finish_plug(&plug);
1689
1690 if (unlikely(wc->writeback_all)) {
1691 wc_lock(wc);
1692 while (writecache_wait_for_writeback(wc));
1693 wc_unlock(wc);
1694 }
1695}
1696
1697static int calculate_memory_size(uint64_t device_size, unsigned block_size,
1698 size_t *n_blocks_p, size_t *n_metadata_blocks_p)
1699{
1700 uint64_t n_blocks, offset;
1701 struct wc_entry e;
1702
1703 n_blocks = device_size;
1704 do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));
1705
1706 while (1) {
1707 if (!n_blocks)
1708 return -ENOSPC;
1709 /* Verify the following entries[n_blocks] won't overflow */
1710 if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
1711 sizeof(struct wc_memory_entry)))
1712 return -EFBIG;
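/*
 * Metadata is the superblock plus one wc_memory_entry per cache
 * block, rounded up to a block boundary.
 */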
1713 offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
1714 offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
1715 if (offset + n_blocks * block_size <= device_size)
1716 break;
1717 n_blocks--;
1718 }
1719
1720 /* check if the bit field overflows */
1721 e.index = n_blocks;
1722 if (e.index != n_blocks)
1723 return -EFBIG;
1724
1725 if (n_blocks_p)
1726 *n_blocks_p = n_blocks;
1727 if (n_metadata_blocks_p)
1728 *n_metadata_blocks_p = offset >> __ffs(block_size);
1729 return 0;
1730}
1731
1732static int init_memory(struct dm_writecache *wc)
1733{
1734 size_t b;
1735 int r;
1736
1737 r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
1738 if (r)
1739 return r;
1740
1741 r = writecache_alloc_entries(wc);
1742 if (r)
1743 return r;
1744
1745 for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
1746 pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
1747 pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
1748 pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
1749 pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
1750 pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
1751
1752 for (b = 0; b < wc->n_blocks; b++)
1753 write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
1754
1755 writecache_flush_all_metadata(wc);
1756 writecache_commit_flushed(wc);
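/*
 * Write the magic last, after all other metadata has been flushed, so
 * a partially initialized superblock is never treated as valid.
 */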
1757 pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
1758 writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
1759 writecache_commit_flushed(wc);
1760
1761 return 0;
1762}
1763
1764static void writecache_dtr(struct dm_target *ti)
1765{
1766 struct dm_writecache *wc = ti->private;
1767
1768 if (!wc)
1769 return;
1770
1771 if (wc->endio_thread)
1772 kthread_stop(wc->endio_thread);
1773
1774 if (wc->flush_thread)
1775 kthread_stop(wc->flush_thread);
1776
1777 bioset_exit(&wc->bio_set);
1778
1779 mempool_exit(&wc->copy_pool);
1780
1781 if (wc->writeback_wq)
1782 destroy_workqueue(wc->writeback_wq);
1783
1784 if (wc->dev)
1785 dm_put_device(ti, wc->dev);
1786
1787 if (wc->ssd_dev)
1788 dm_put_device(ti, wc->ssd_dev);
1789
1790 if (wc->entries)
1791 vfree(wc->entries);
1792
1793 if (wc->memory_map) {
1794 if (WC_MODE_PMEM(wc))
1795 persistent_memory_release(wc);
1796 else
1797 vfree(wc->memory_map);
1798 }
1799
1800 if (wc->dm_kcopyd)
1801 dm_kcopyd_client_destroy(wc->dm_kcopyd);
1802
1803 if (wc->dm_io)
1804 dm_io_client_destroy(wc->dm_io);
1805
1806 if (wc->dirty_bitmap)
1807 vfree(wc->dirty_bitmap);
1808
1809 kfree(wc);
1810}
1811
1812static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
1813{
1814 struct dm_writecache *wc;
1815 struct dm_arg_set as;
1816 const char *string;
1817 unsigned opt_params;
1818 size_t offset, data_size;
1819 int i, r;
1820 char dummy;
1821 int high_wm_percent = HIGH_WATERMARK;
1822 int low_wm_percent = LOW_WATERMARK;
1823 uint64_t x;
1824 struct wc_memory_superblock s;
1825
1826 static struct dm_arg _args[] = {
1827 {0, 10, "Invalid number of feature args"},
1828 };
1829
1830 as.argc = argc;
1831 as.argv = argv;
1832
1833 wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
1834 if (!wc) {
1835 ti->error = "Cannot allocate writecache structure";
1836 r = -ENOMEM;
1837 goto bad;
1838 }
1839 ti->private = wc;
1840 wc->ti = ti;
1841
1842 mutex_init(&wc->lock);
1843 writecache_poison_lists(wc);
1844 init_waitqueue_head(&wc->freelist_wait);
1845 timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
1846
1847 for (i = 0; i < 2; i++) {
1848 atomic_set(&wc->bio_in_progress[i], 0);
1849 init_waitqueue_head(&wc->bio_in_progress_wait[i]);
1850 }
1851
1852 wc->dm_io = dm_io_client_create();
1853 if (IS_ERR(wc->dm_io)) {
1854 r = PTR_ERR(wc->dm_io);
1855 ti->error = "Unable to allocate dm-io client";
1856 wc->dm_io = NULL;
1857 goto bad;
1858 }
1859
1860 wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
1861 if (!wc->writeback_wq) {
1862 r = -ENOMEM;
1863 ti->error = "Could not allocate writeback workqueue";
1864 goto bad;
1865 }
1866 INIT_WORK(&wc->writeback_work, writecache_writeback);
1867 INIT_WORK(&wc->flush_work, writecache_flush_work);
1868
1869 raw_spin_lock_init(&wc->endio_list_lock);
1870 INIT_LIST_HEAD(&wc->endio_list);
1871 wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio");
1872 if (IS_ERR(wc->endio_thread)) {
1873 r = PTR_ERR(wc->endio_thread);
1874 wc->endio_thread = NULL;
1875 ti->error = "Couldn't spawn endio thread";
1876 goto bad;
1877 }
1878 wake_up_process(wc->endio_thread);
1879
1880 /*
1881 * Parse the mode (pmem or ssd)
1882 */
1883 string = dm_shift_arg(&as);
1884 if (!string)
1885 goto bad_arguments;
1886
1887 if (!strcasecmp(string, "s")) {
1888 wc->pmem_mode = false;
1889 } else if (!strcasecmp(string, "p")) {
1890#ifdef DM_WRITECACHE_HAS_PMEM
1891 wc->pmem_mode = true;
1892 wc->writeback_fua = true;
1893#else
1894 /*
1895 * If the architecture doesn't support persistent memory or
1896 * the kernel doesn't support any DAX drivers, this driver can
1897 * only be used in SSD-only mode.
1898 */
1899 r = -EOPNOTSUPP;
1900 ti->error = "Persistent memory or DAX not supported on this system";
1901 goto bad;
1902#endif
1903 } else {
1904 goto bad_arguments;
1905 }
1906
1907 if (WC_MODE_PMEM(wc)) {
1908 r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
1909 offsetof(struct writeback_struct, bio),
1910 BIOSET_NEED_BVECS);
1911 if (r) {
1912 ti->error = "Could not allocate bio set";
1913 goto bad;
1914 }
1915 } else {
1916 r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
1917 if (r) {
1918 ti->error = "Could not allocate mempool";
1919 goto bad;
1920 }
1921 }
1922
1923 /*
1924 * Parse the origin data device
1925 */
1926 string = dm_shift_arg(&as);
1927 if (!string)
1928 goto bad_arguments;
1929 r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
1930 if (r) {
1931 ti->error = "Origin data device lookup failed";
1932 goto bad;
1933 }
1934
1935 /*
1936 * Parse cache data device (be it pmem or ssd)
1937 */
1938 string = dm_shift_arg(&as);
1939 if (!string)
1940 goto bad_arguments;
1941
1942 r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
1943 if (r) {
1944 ti->error = "Cache data device lookup failed";
1945 goto bad;
1946 }
1947 wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode);
1948
1949 if (WC_MODE_PMEM(wc)) {
1950 r = persistent_memory_claim(wc);
1951 if (r) {
1952 ti->error = "Unable to map persistent memory for cache";
1953 goto bad;
1954 }
1955 }
1956
1957 /*
1958 * Parse the cache block size
1959 */
1960 string = dm_shift_arg(&as);
1961 if (!string)
1962 goto bad_arguments;
1963 if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
1964 wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
1965 (wc->block_size & (wc->block_size - 1))) {
1966 r = -EINVAL;
1967 ti->error = "Invalid block size";
1968 goto bad;
1969 }
1970 wc->block_size_bits = __ffs(wc->block_size);
1971
1972 wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
1973 wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
1974 wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);
1975
1976 /*
1977 * Parse optional arguments
1978 */
1979 r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
1980 if (r)
1981 goto bad;
1982
1983 while (opt_params) {
1984 string = dm_shift_arg(&as), opt_params--;
1985 if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
1986 string = dm_shift_arg(&as), opt_params--;
1987 if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
1988 goto invalid_optional;
1989 if (high_wm_percent < 0 || high_wm_percent > 100)
1990 goto invalid_optional;
1991 wc->high_wm_percent_set = true;
1992 } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
1993 string = dm_shift_arg(&as), opt_params--;
1994 if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
1995 goto invalid_optional;
1996 if (low_wm_percent < 0 || low_wm_percent > 100)
1997 goto invalid_optional;
1998 wc->low_wm_percent_set = true;
1999 } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
2000 string = dm_shift_arg(&as), opt_params--;
2001 if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
2002 goto invalid_optional;
2003 wc->max_writeback_jobs_set = true;
2004 } else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
2005 string = dm_shift_arg(&as), opt_params--;
2006 if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
2007 goto invalid_optional;
2008 wc->autocommit_blocks_set = true;
2009 } else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
2010 unsigned autocommit_msecs;
2011 string = dm_shift_arg(&as), opt_params--;
2012 if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
2013 goto invalid_optional;
2014 if (autocommit_msecs > 3600000)
2015 goto invalid_optional;
2016 wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
2017 wc->autocommit_time_set = true;
2018 } else if (!strcasecmp(string, "fua")) {
2019 if (WC_MODE_PMEM(wc)) {
2020 wc->writeback_fua = true;
2021 wc->writeback_fua_set = true;
2022 } else goto invalid_optional;
2023 } else if (!strcasecmp(string, "nofua")) {
2024 if (WC_MODE_PMEM(wc)) {
2025 wc->writeback_fua = false;
2026 wc->writeback_fua_set = true;
2027 } else goto invalid_optional;
2028 } else {
2029invalid_optional:
2030 r = -EINVAL;
2031 ti->error = "Invalid optional argument";
2032 goto bad;
2033 }
2034 }
2035
2036 if (high_wm_percent < low_wm_percent) {
2037 r = -EINVAL;
2038 ti->error = "High watermark must be greater than or equal to low watermark";
2039 goto bad;
2040 }
2041
2042 if (!WC_MODE_PMEM(wc)) {
2043 struct dm_io_region region;
2044 struct dm_io_request req;
2045 size_t n_blocks, n_metadata_blocks;
2046 uint64_t n_bitmap_bits;
2047
2048 bio_list_init(&wc->flush_list);
2049 wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush");
2050 if (IS_ERR(wc->flush_thread)) {
2051 r = PTR_ERR(wc->flush_thread);
2052 wc->flush_thread = NULL;
2053 ti->error = "Couldn't spawn endio thread";
2054 goto bad;
2055 }
2056 wake_up_process(wc->flush_thread);
2057
2058 r = calculate_memory_size(wc->memory_map_size, wc->block_size,
2059 &n_blocks, &n_metadata_blocks);
2060 if (r) {
2061 ti->error = "Invalid device size";
2062 goto bad;
2063 }
2064
2065 n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
2066 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
2067 /* this is a limitation of the test_bit functions */
2068 if (n_bitmap_bits > 1U << 31) {
2069 r = -EFBIG;
2070 ti->error = "Invalid device size";
2071 goto bad;
2072 }
2073
2074 wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
2075 if (!wc->memory_map) {
2076 r = -ENOMEM;
2077 ti->error = "Unable to allocate memory for metadata";
2078 goto bad;
2079 }
2080
2081 wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2082 if (IS_ERR(wc->dm_kcopyd)) {
2083 r = PTR_ERR(wc->dm_kcopyd);
2084 ti->error = "Unable to allocate dm-kcopyd client";
2085 wc->dm_kcopyd = NULL;
2086 goto bad;
2087 }
2088
2089 wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
2090 wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
2091 BITS_PER_LONG * sizeof(unsigned long);
2092 wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
2093 if (!wc->dirty_bitmap) {
2094 r = -ENOMEM;
2095 ti->error = "Unable to allocate dirty bitmap";
2096 goto bad;
2097 }
2098
2099 region.bdev = wc->ssd_dev->bdev;
2100 region.sector = 0;
2101 region.count = wc->metadata_sectors;
2102 req.bi_op = REQ_OP_READ;
2103 req.bi_op_flags = REQ_SYNC;
2104 req.mem.type = DM_IO_VMA;
2105 req.mem.ptr.vma = (char *)wc->memory_map;
2106 req.client = wc->dm_io;
2107 req.notify.fn = NULL;
2108
2109 r = dm_io(&req, 1, &region, NULL);
2110 if (r) {
2111 ti->error = "Unable to read metadata";
2112 goto bad;
2113 }
2114 }
2115
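/*
 * memcpy_mcsafe() returns an error on a hardware memory fault instead
 * of crashing, so a bad superblock region is reported cleanly.
 */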
2116 r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
2117 if (r) {
2118 ti->error = "Hardware memory error when reading superblock";
2119 goto bad;
2120 }
2121 if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
2122 r = init_memory(wc);
2123 if (r) {
2124 ti->error = "Unable to initialize device";
2125 goto bad;
2126 }
2127 r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
2128 if (r) {
2129 ti->error = "Hardware memory error when reading superblock";
2130 goto bad;
2131 }
2132 }
2133
2134 if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
2135 ti->error = "Invalid magic in the superblock";
2136 r = -EINVAL;
2137 goto bad;
2138 }
2139
2140 if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
2141 ti->error = "Invalid version in the superblock";
2142 r = -EINVAL;
2143 goto bad;
2144 }
2145
2146 if (le32_to_cpu(s.block_size) != wc->block_size) {
2147 ti->error = "Block size does not match superblock";
2148 r = -EINVAL;
2149 goto bad;
2150 }
2151
2152 wc->n_blocks = le64_to_cpu(s.n_blocks);
2153
2154 offset = wc->n_blocks * sizeof(struct wc_memory_entry);
2155 if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
2156overflow:
2157 ti->error = "Overflow in size calculation";
2158 r = -EINVAL;
2159 goto bad;
2160 }
2161 offset += sizeof(struct wc_memory_superblock);
2162 if (offset < sizeof(struct wc_memory_superblock))
2163 goto overflow;
2164 offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
2165 data_size = wc->n_blocks * (size_t)wc->block_size;
2166 if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
2167 (offset + data_size < offset))
2168 goto overflow;
2169 if (offset + data_size > wc->memory_map_size) {
2170 ti->error = "Memory area is too small";
2171 r = -EINVAL;
2172 goto bad;
2173 }
2174
2175 wc->metadata_sectors = offset >> SECTOR_SHIFT;
2176 wc->block_start = (char *)sb(wc) + offset;
2177
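/*
 * Convert the watermark percentages (expressed in used blocks) into
 * free-list block counts, rounding to the nearest block.
 */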
2178 x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
2179 x += 50;
2180 do_div(x, 100);
2181 wc->freelist_high_watermark = x;
2182 x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
2183 x += 50;
2184 do_div(x, 100);
2185 wc->freelist_low_watermark = x;
2186
2187 r = writecache_alloc_entries(wc);
2188 if (r) {
2189 ti->error = "Cannot allocate memory";
2190 goto bad;
2191 }
2192
2193 ti->num_flush_bios = 1;
2194 ti->flush_supported = true;
2195 ti->num_discard_bios = 1;
2196
2197 if (WC_MODE_PMEM(wc))
2198 persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
2199
2200 return 0;
2201
2202bad_arguments:
2203 r = -EINVAL;
2204 ti->error = "Bad arguments";
2205bad:
2206 writecache_dtr(ti);
2207 return r;
2208}
2209
2210static void writecache_status(struct dm_target *ti, status_type_t type,
2211 unsigned status_flags, char *result, unsigned maxlen)
2212{
2213 struct dm_writecache *wc = ti->private;
2214 unsigned extra_args;
2215 unsigned sz = 0;
2216 uint64_t x;
2217
2218 switch (type) {
2219 case STATUSTYPE_INFO:
2220 DMEMIT("%ld %llu %llu %llu", writecache_has_error(wc),
2221 (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
2222 (unsigned long long)wc->writeback_size);
2223 break;
2224 case STATUSTYPE_TABLE:
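/*
 * Table line format, e.g. "s /dev/sdb /dev/sdc 4096 2 high_watermark 50"
 * (device names here are only illustrative).
 */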
2225 DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
2226 wc->dev->name, wc->ssd_dev->name, wc->block_size);
2227 extra_args = 0;
2228 if (wc->high_wm_percent_set)
2229 extra_args += 2;
2230 if (wc->low_wm_percent_set)
2231 extra_args += 2;
2232 if (wc->max_writeback_jobs_set)
2233 extra_args += 2;
2234 if (wc->autocommit_blocks_set)
2235 extra_args += 2;
2236 if (wc->autocommit_time_set)
2237 extra_args += 2;
2238 if (wc->writeback_fua_set)
2239 extra_args++;
2240
2241 DMEMIT("%u", extra_args);
2242 if (wc->high_wm_percent_set) {
2243 x = (uint64_t)wc->freelist_high_watermark * 100;
2244 x += wc->n_blocks / 2;
2245 do_div(x, (size_t)wc->n_blocks);
2246 DMEMIT(" high_watermark %u", 100 - (unsigned)x);
2247 }
2248 if (wc->low_wm_percent_set) {
2249 x = (uint64_t)wc->freelist_low_watermark * 100;
2250 x += wc->n_blocks / 2;
2251 do_div(x, (size_t)wc->n_blocks);
2252 DMEMIT(" low_watermark %u", 100 - (unsigned)x);
2253 }
2254 if (wc->max_writeback_jobs_set)
2255 DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
2256 if (wc->autocommit_blocks_set)
2257 DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
2258 if (wc->autocommit_time_set)
2259 DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies));
2260 if (wc->writeback_fua_set)
2261 DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
2262 break;
2263 }
2264}
2265
2266static struct target_type writecache_target = {
2267 .name = "writecache",
2268 .version = {1, 0, 0},
2269 .module = THIS_MODULE,
2270 .ctr = writecache_ctr,
2271 .dtr = writecache_dtr,
2272 .status = writecache_status,
2273 .postsuspend = writecache_suspend,
2274 .resume = writecache_resume,
2275 .message = writecache_message,
2276 .map = writecache_map,
2277 .end_io = writecache_end_io,
2278 .iterate_devices = writecache_iterate_devices,
2279 .io_hints = writecache_io_hints,
2280};
2281
2282static int __init dm_writecache_init(void)
2283{
2284 int r;
2285
2286 r = dm_register_target(&writecache_target);
2287 if (r < 0) {
2288 DMERR("register failed %d", r);
2289 return r;
2290 }
2291
2292 return 0;
2293}
2294
2295static void __exit dm_writecache_exit(void)
2296{
2297 dm_unregister_target(&writecache_target);
2298}
2299
2300module_init(dm_writecache_init);
2301module_exit(dm_writecache_exit);
2302
2303MODULE_DESCRIPTION(DM_NAME " writecache target");
2304MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
2305MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 30602d15ad9a..3c0e45f4dcf5 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -52,9 +52,9 @@ struct dmz_target {
 	struct dmz_reclaim *reclaim;
 
 	/* For chunk work */
-	struct mutex chunk_lock;
 	struct radix_tree_root chunk_rxtree;
 	struct workqueue_struct *chunk_wq;
+	struct mutex chunk_lock;
 
 	/* For cloned BIOs to zones */
 	struct bio_set bio_set;