author	Mikulas Patocka <mpatocka@redhat.com>	2018-03-08 08:25:24 -0500
committer	Mike Snitzer <snitzer@redhat.com>	2018-06-08 11:59:51 -0400
commit	48debafe4f2feabcc99f8e2659e80557e3ca6b39 (patch)
tree	898a7c9c33238b068a79d40e97c380b36b1498ee
parent	72d711c8768805b5f8cf2d23c575dfd188993e12 (diff)
dm: add writecache target
The writecache target caches writes on persistent memory or SSD. It is
intended for databases or other programs that need extremely low commit
latency.

The writecache target doesn't cache reads because reads are supposed to
be cached in page cache in normal RAM.

If persistent memory isn't available this target can still be used in
SSD mode.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Colin Ian King <colin.king@canonical.com> # fix missing goto
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com> # fix compilation issue with !DAX
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com> # use msecs_to_jiffies
Acked-by: Dan Williams <dan.j.williams@intel.com> # reworks to unify ARM and x86 flushing
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
-rw-r--r--	Documentation/device-mapper/writecache.txt	68
-rw-r--r--	drivers/md/Kconfig	11
-rw-r--r--	drivers/md/Makefile	1
-rw-r--r--	drivers/md/dm-writecache.c	2305
4 files changed, 2385 insertions, 0 deletions
diff --git a/Documentation/device-mapper/writecache.txt b/Documentation/device-mapper/writecache.txt
new file mode 100644
index 000000000000..4424fa2c67d7
--- /dev/null
+++ b/Documentation/device-mapper/writecache.txt
@@ -0,0 +1,68 @@
1The writecache target caches writes on persistent memory or on SSD. It
2doesn't cache reads because reads are supposed to be cached in page cache
3in normal RAM.
4
5When the device is constructed, the first sector should be zeroed or the
6first sector should contain a valid superblock from a previous invocation.
7
8Constructor parameters:
91. type of the cache device - "p" or "s"
10 p - persistent memory
11 s - SSD
122. the underlying device that will be cached
133. the cache device
144. block size (4096 is recommended; the maximum block size is the page
15 size)
165. the number of optional parameters (the parameters with an argument
17 count as two)
18 high_watermark n (default: 50)
19 start writeback when the number of used blocks reaches this
20 watermark
21 low_watermark x (default: 45)
22 stop writeback when the number of used blocks drops below
23 this watermark
24 writeback_jobs n (default: unlimited)
25 limit the number of blocks that are in flight during
26 writeback. Setting this value reduces writeback
27 throughput, but it may improve latency of read requests
28 autocommit_blocks n (default: 64 for pmem, 65536 for ssd)
29 when the application writes this number of blocks without
30 issuing a FLUSH request, the blocks are automatically
31 committed
32 autocommit_time ms (default: 1000)
33 autocommit time in milliseconds. The data is automatically
34 committed if this time passes and no FLUSH request is
35 received
36 fua (by default on)
37 applicable only to persistent memory - use the FUA flag
38 when writing data from persistent memory back to the
39 underlying device
40 nofua
41 applicable only to persistent memory - don't use the FUA
42 flag when writing back data and send the FLUSH request
43 afterwards
44 - some underlying devices perform better with fua, some
45 with nofua. The user should test it
46
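A minimal construction example (the device names, sector count and
option values are illustrative), caching /dev/sdb on the persistent
memory device /dev/pmem0 with a 4096-byte block size; the two watermark
options each take an argument, so the optional-parameter count is 4:

	dmsetup create wc --table "0 <origin_sectors> writecache p /dev/sdb /dev/pmem0 4096 4 high_watermark 50 low_watermark 45"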
47Status:
481. error indicator - 0 if there was no error, otherwise error number
492. the number of blocks
503. the number of free blocks
514. the number of blocks under writeback
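	An illustrative status line (the numbers are made up): error
	indicator 0, 87040 blocks total, 86592 free, 0 under writeback:

		0 87040 86592 0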
52
53Messages:
54 flush
55 flush the cache device. The message returns successfully
56 if the cache device was flushed without an error
57 flush_on_suspend
58 flush the cache device on next suspend. Use this message
59 when you are going to remove the cache device. The proper
60 sequence for removing the cache device is:
61 1. send the "flush_on_suspend" message
62 2. load an inactive table with a linear target that maps
63 to the underlying device
64 3. suspend the device
65 4. ask for status and verify that there are no errors
66 5. resume the device, so that it will use the linear
67 target
68 6. the cache device is now inactive and it can be deleted
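As a sketch, the removal sequence above might look like this with
dmsetup (the device name "wc", the origin device and its size are
illustrative):

	dmsetup message wc 0 flush_on_suspend
	dmsetup reload wc --table "0 <origin_sectors> linear /dev/sdb 0"
	dmsetup suspend wc
	dmsetup status wc	# verify that the error indicator is 0
	dmsetup resume wc
	# the cache device is now inactive and can be deleted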
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index edff083f7c4e..8b8c123cae66 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -334,6 +334,17 @@ config DM_CACHE_SMQ
334 of less memory utilization, improved performance and increased
335 adaptability in the face of changing workloads.
336
337config DM_WRITECACHE
338 tristate "Writecache target"
339 depends on BLK_DEV_DM
340 ---help---
341 The writecache target caches writes on persistent memory or SSD.
342 It is intended for databases or other programs that need extremely
343 low commit latency.
344
345 The writecache target doesn't cache reads because reads are supposed
346 to be cached in standard RAM.
347
348config DM_ERA
349 tristate "Era target (EXPERIMENTAL)"
350 depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 63255f3ebd97..822f4e8753bc 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -67,6 +67,7 @@ obj-$(CONFIG_DM_ERA) += dm-era.o
67obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
68obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o
69obj-$(CONFIG_DM_ZONED) += dm-zoned.o
70obj-$(CONFIG_DM_WRITECACHE) += dm-writecache.o
71
72ifeq ($(CONFIG_DM_UEVENT),y)
73dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
new file mode 100644
index 000000000000..5961c7794ef3
--- /dev/null
+++ b/drivers/md/dm-writecache.c
@@ -0,0 +1,2305 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2018 Red Hat. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
7
8#include <linux/device-mapper.h>
9#include <linux/module.h>
10#include <linux/init.h>
11#include <linux/vmalloc.h>
12#include <linux/kthread.h>
13#include <linux/dm-io.h>
14#include <linux/dm-kcopyd.h>
15#include <linux/dax.h>
16#include <linux/pfn_t.h>
17#include <linux/libnvdimm.h>
18
19#define DM_MSG_PREFIX "writecache"
20
21#define HIGH_WATERMARK 50
22#define LOW_WATERMARK 45
23#define MAX_WRITEBACK_JOBS 0
24#define ENDIO_LATENCY 16
25#define WRITEBACK_LATENCY 64
26#define AUTOCOMMIT_BLOCKS_SSD 65536
27#define AUTOCOMMIT_BLOCKS_PMEM 64
28#define AUTOCOMMIT_MSEC 1000
29
30#define BITMAP_GRANULARITY 65536
31#if BITMAP_GRANULARITY < PAGE_SIZE
32#undef BITMAP_GRANULARITY
33#define BITMAP_GRANULARITY PAGE_SIZE
34#endif
35
36#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_DAX_DRIVER)
37#define DM_WRITECACHE_HAS_PMEM
38#endif
39
40#ifdef DM_WRITECACHE_HAS_PMEM
41#define pmem_assign(dest, src) \
42do { \
43 typeof(dest) uniq = (src); \
44 memcpy_flushcache(&(dest), &uniq, sizeof(dest)); \
45} while (0)
46#else
47#define pmem_assign(dest, src) ((dest) = (src))
48#endif
49
50#if defined(__HAVE_ARCH_MEMCPY_MCSAFE) && defined(DM_WRITECACHE_HAS_PMEM)
51#define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
52#endif
53
54#define MEMORY_SUPERBLOCK_MAGIC 0x23489321
55#define MEMORY_SUPERBLOCK_VERSION 1
56
57struct wc_memory_entry {
58 __le64 original_sector;
59 __le64 seq_count;
60};
61
62struct wc_memory_superblock {
63 union {
64 struct {
65 __le32 magic;
66 __le32 version;
67 __le32 block_size;
68 __le32 pad;
69 __le64 n_blocks;
70 __le64 seq_count;
71 };
72 __le64 padding[8];
73 };
74 struct wc_memory_entry entries[0];
75};
76
77struct wc_entry {
78 struct rb_node rb_node;
79 struct list_head lru;
80 unsigned short wc_list_contiguous;
81 bool write_in_progress
82#if BITS_PER_LONG == 64
83 :1
84#endif
85 ;
86 unsigned long index
87#if BITS_PER_LONG == 64
88 :47
89#endif
90 ;
91#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
92 uint64_t original_sector;
93 uint64_t seq_count;
94#endif
95};
96
97#ifdef DM_WRITECACHE_HAS_PMEM
98#define WC_MODE_PMEM(wc) ((wc)->pmem_mode)
99#define WC_MODE_FUA(wc) ((wc)->writeback_fua)
100#else
101#define WC_MODE_PMEM(wc) false
102#define WC_MODE_FUA(wc) false
103#endif
104#define WC_MODE_SORT_FREELIST(wc) (!WC_MODE_PMEM(wc))
105
106struct dm_writecache {
107 struct mutex lock;
108 struct list_head lru;
109 union {
110 struct list_head freelist;
111 struct {
112 struct rb_root freetree;
113 struct wc_entry *current_free;
114 };
115 };
116 struct rb_root tree;
117
118 size_t freelist_size;
119 size_t writeback_size;
120 size_t freelist_high_watermark;
121 size_t freelist_low_watermark;
122
123 unsigned uncommitted_blocks;
124 unsigned autocommit_blocks;
125 unsigned max_writeback_jobs;
126
127 int error;
128
129 unsigned long autocommit_jiffies;
130 struct timer_list autocommit_timer;
131 struct wait_queue_head freelist_wait;
132
133 atomic_t bio_in_progress[2];
134 struct wait_queue_head bio_in_progress_wait[2];
135
136 struct dm_target *ti;
137 struct dm_dev *dev;
138 struct dm_dev *ssd_dev;
139 void *memory_map;
140 uint64_t memory_map_size;
141 size_t metadata_sectors;
142 size_t n_blocks;
143 uint64_t seq_count;
144 void *block_start;
145 struct wc_entry *entries;
146 unsigned block_size;
147 unsigned char block_size_bits;
148
149 bool pmem_mode:1;
150 bool writeback_fua:1;
151
152 bool overwrote_committed:1;
153 bool memory_vmapped:1;
154
155 bool high_wm_percent_set:1;
156 bool low_wm_percent_set:1;
157 bool max_writeback_jobs_set:1;
158 bool autocommit_blocks_set:1;
159 bool autocommit_time_set:1;
160 bool writeback_fua_set:1;
161 bool flush_on_suspend:1;
162
163 unsigned writeback_all;
164 struct workqueue_struct *writeback_wq;
165 struct work_struct writeback_work;
166 struct work_struct flush_work;
167
168 struct dm_io_client *dm_io;
169
170 raw_spinlock_t endio_list_lock;
171 struct list_head endio_list;
172 struct task_struct *endio_thread;
173
174 struct task_struct *flush_thread;
175 struct bio_list flush_list;
176
177 struct dm_kcopyd_client *dm_kcopyd;
178 unsigned long *dirty_bitmap;
179 unsigned dirty_bitmap_size;
180
181 struct bio_set bio_set;
182 mempool_t copy_pool;
183};
184
185#define WB_LIST_INLINE 16
186
187struct writeback_struct {
188 struct list_head endio_entry;
189 struct dm_writecache *wc;
190 struct wc_entry **wc_list;
191 unsigned wc_list_n;
192 unsigned page_offset;
193 struct page *page;
194 struct wc_entry *wc_list_inline[WB_LIST_INLINE];
195 struct bio bio;
196};
197
198struct copy_struct {
199 struct list_head endio_entry;
200 struct dm_writecache *wc;
201 struct wc_entry *e;
202 unsigned n_entries;
203 int error;
204};
205
206DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
207 "A percentage of time allocated for data copying");
208
209static void wc_lock(struct dm_writecache *wc)
210{
211 mutex_lock(&wc->lock);
212}
213
214static void wc_unlock(struct dm_writecache *wc)
215{
216 mutex_unlock(&wc->lock);
217}
218
219#ifdef DM_WRITECACHE_HAS_PMEM
220static int persistent_memory_claim(struct dm_writecache *wc)
221{
222 int r;
223 loff_t s;
224 long p, da;
225 pfn_t pfn;
226 int id;
227 struct page **pages;
228
229 wc->memory_vmapped = false;
230
231 if (!wc->ssd_dev->dax_dev) {
232 r = -EOPNOTSUPP;
233 goto err1;
234 }
235 s = wc->memory_map_size;
236 p = s >> PAGE_SHIFT;
237 if (!p) {
238 r = -EINVAL;
239 goto err1;
240 }
241 if (p != s >> PAGE_SHIFT) {
242 r = -EOVERFLOW;
243 goto err1;
244 }
245
246 id = dax_read_lock();
247
248 da = dax_direct_access(wc->ssd_dev->dax_dev, 0, p, &wc->memory_map, &pfn);
249 if (da < 0) {
250 wc->memory_map = NULL;
251 r = da;
252 goto err2;
253 }
254 if (!pfn_t_has_page(pfn)) {
255 wc->memory_map = NULL;
256 r = -EOPNOTSUPP;
257 goto err2;
258 }
259 if (da != p) {
260 long i;
261 wc->memory_map = NULL;
262 pages = kvmalloc(p * sizeof(struct page *), GFP_KERNEL);
263 if (!pages) {
264 r = -ENOMEM;
265 goto err2;
266 }
267 i = 0;
268 do {
269 long daa;
270 void *dummy_addr;
271 daa = dax_direct_access(wc->ssd_dev->dax_dev, i, p - i,
272 &dummy_addr, &pfn);
273 if (daa <= 0) {
274 r = daa ? daa : -EINVAL;
275 goto err3;
276 }
277 if (!pfn_t_has_page(pfn)) {
278 r = -EOPNOTSUPP;
279 goto err3;
280 }
281 while (daa-- && i < p) {
282 pages[i++] = pfn_t_to_page(pfn);
283 pfn.val++;
284 }
285 } while (i < p);
286 wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
287 if (!wc->memory_map) {
288 r = -ENOMEM;
289 goto err3;
290 }
291 kvfree(pages);
292 wc->memory_vmapped = true;
293 }
294
295 dax_read_unlock(id);
296 return 0;
297err3:
298 kvfree(pages);
299err2:
300 dax_read_unlock(id);
301err1:
302 return r;
303}
304#else
305static int persistent_memory_claim(struct dm_writecache *wc)
306{
307 BUG();
308}
309#endif
310
311static void persistent_memory_release(struct dm_writecache *wc)
312{
313 if (wc->memory_vmapped)
314 vunmap(wc->memory_map);
315}
316
317static struct page *persistent_memory_page(void *addr)
318{
319 if (is_vmalloc_addr(addr))
320 return vmalloc_to_page(addr);
321 else
322 return virt_to_page(addr);
323}
324
325static unsigned persistent_memory_page_offset(void *addr)
326{
327 return (unsigned long)addr & (PAGE_SIZE - 1);
328}
329
330static void persistent_memory_flush_cache(void *ptr, size_t size)
331{
332 if (is_vmalloc_addr(ptr))
333 flush_kernel_vmap_range(ptr, size);
334}
335
336static void persistent_memory_invalidate_cache(void *ptr, size_t size)
337{
338 if (is_vmalloc_addr(ptr))
339 invalidate_kernel_vmap_range(ptr, size);
340}
341
342static struct wc_memory_superblock *sb(struct dm_writecache *wc)
343{
344 return wc->memory_map;
345}
346
347static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
348{
349 if (is_power_of_2(sizeof(struct wc_entry)) && 0)
350 return &sb(wc)->entries[e - wc->entries];
351 else
352 return &sb(wc)->entries[e->index];
353}
354
355static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
356{
357 return (char *)wc->block_start + (e->index << wc->block_size_bits);
358}
359
360static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
361{
362 return wc->metadata_sectors +
363 ((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
364}
365
366static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
367{
368#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
369 return e->original_sector;
370#else
371 return le64_to_cpu(memory_entry(wc, e)->original_sector);
372#endif
373}
374
375static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
376{
377#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
378 return e->seq_count;
379#else
380 return le64_to_cpu(memory_entry(wc, e)->seq_count);
381#endif
382}
383
384static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
385{
386#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
387 e->seq_count = -1;
388#endif
389 pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
390}
391
392static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
393 uint64_t original_sector, uint64_t seq_count)
394{
395 struct wc_memory_entry me;
396#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
397 e->original_sector = original_sector;
398 e->seq_count = seq_count;
399#endif
400 me.original_sector = cpu_to_le64(original_sector);
401 me.seq_count = cpu_to_le64(seq_count);
402 pmem_assign(*memory_entry(wc, e), me);
403}
404
405#define writecache_error(wc, err, msg, arg...) \
406do { \
407 if (!cmpxchg(&(wc)->error, 0, err)) \
408 DMERR(msg, ##arg); \
409 wake_up(&(wc)->freelist_wait); \
410} while (0)
411
412#define writecache_has_error(wc) (unlikely(READ_ONCE((wc)->error)))
413
414static void writecache_flush_all_metadata(struct dm_writecache *wc)
415{
416 if (!WC_MODE_PMEM(wc))
417 memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
418}
419
420static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
421{
422 if (!WC_MODE_PMEM(wc))
423 __set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
424 wc->dirty_bitmap);
425}
426
427static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);
428
429struct io_notify {
430 struct dm_writecache *wc;
431 struct completion c;
432 atomic_t count;
433};
434
435static void writecache_notify_io(unsigned long error, void *context)
436{
437 struct io_notify *endio = context;
438
439 if (unlikely(error != 0))
440 writecache_error(endio->wc, -EIO, "error writing metadata");
441 BUG_ON(atomic_read(&endio->count) <= 0);
442 if (atomic_dec_and_test(&endio->count))
443 complete(&endio->c);
444}
445
446static void ssd_commit_flushed(struct dm_writecache *wc)
447{
448 struct dm_io_region region;
449 struct dm_io_request req;
450 struct io_notify endio = {
451 wc,
452 COMPLETION_INITIALIZER_ONSTACK(endio.c),
453 ATOMIC_INIT(1),
454 };
455 unsigned bitmap_bits = wc->dirty_bitmap_size * BITS_PER_LONG;
456 unsigned i = 0;
457
458 while (1) {
459 unsigned j;
460 i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
461 if (unlikely(i == bitmap_bits))
462 break;
463 j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);
464
465 region.bdev = wc->ssd_dev->bdev;
466 region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
467 region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
468
469 if (unlikely(region.sector >= wc->metadata_sectors))
470 break;
471 if (unlikely(region.sector + region.count > wc->metadata_sectors))
472 region.count = wc->metadata_sectors - region.sector;
473
474 atomic_inc(&endio.count);
475 req.bi_op = REQ_OP_WRITE;
476 req.bi_op_flags = REQ_SYNC;
477 req.mem.type = DM_IO_VMA;
478 req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
479 req.client = wc->dm_io;
480 req.notify.fn = writecache_notify_io;
481 req.notify.context = &endio;
482
483 /* writing via async dm-io (implied by notify.fn above) won't return an error */
484 (void) dm_io(&req, 1, &region, NULL);
485 i = j;
486 }
487
488 writecache_notify_io(0, &endio);
489 wait_for_completion_io(&endio.c);
490
491 writecache_disk_flush(wc, wc->ssd_dev);
492
493 memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
494}
495
496static void writecache_commit_flushed(struct dm_writecache *wc)
497{
498 if (WC_MODE_PMEM(wc))
499 wmb();
500 else
501 ssd_commit_flushed(wc);
502}
503
504static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
505{
506 int r;
507 struct dm_io_region region;
508 struct dm_io_request req;
509
510 region.bdev = dev->bdev;
511 region.sector = 0;
512 region.count = 0;
513 req.bi_op = REQ_OP_WRITE;
514 req.bi_op_flags = REQ_PREFLUSH;
515 req.mem.type = DM_IO_KMEM;
516 req.mem.ptr.addr = NULL;
517 req.client = wc->dm_io;
518 req.notify.fn = NULL;
519
520 r = dm_io(&req, 1, &region, NULL);
521 if (unlikely(r))
522 writecache_error(wc, r, "error flushing metadata: %d", r);
523}
524
525static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
526{
527 wait_event(wc->bio_in_progress_wait[direction],
528 !atomic_read(&wc->bio_in_progress[direction]));
529}
530
531#define WFE_RETURN_FOLLOWING 1
532#define WFE_LOWEST_SEQ 2
533
534static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
535 uint64_t block, int flags)
536{
537 struct wc_entry *e;
538 struct rb_node *node = wc->tree.rb_node;
539
540 if (unlikely(!node))
541 return NULL;
542
543 while (1) {
544 e = container_of(node, struct wc_entry, rb_node);
545 if (read_original_sector(wc, e) == block)
546 break;
547 node = (read_original_sector(wc, e) >= block ?
548 e->rb_node.rb_left : e->rb_node.rb_right);
549 if (unlikely(!node)) {
550 if (!(flags & WFE_RETURN_FOLLOWING)) {
551 return NULL;
552 }
553 if (read_original_sector(wc, e) >= block) {
554 break;
555 } else {
556 node = rb_next(&e->rb_node);
557 if (unlikely(!node)) {
558 return NULL;
559 }
560 e = container_of(node, struct wc_entry, rb_node);
561 break;
562 }
563 }
564 }
565
566 while (1) {
567 struct wc_entry *e2;
568 if (flags & WFE_LOWEST_SEQ)
569 node = rb_prev(&e->rb_node);
570 else
571 node = rb_next(&e->rb_node);
572 if (!node)
573 return e;
574 e2 = container_of(node, struct wc_entry, rb_node);
575 if (read_original_sector(wc, e2) != block)
576 return e;
577 e = e2;
578 }
579}
580
581static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
582{
583 struct wc_entry *e;
584 struct rb_node **node = &wc->tree.rb_node, *parent = NULL;
585
586 while (*node) {
587 e = container_of(*node, struct wc_entry, rb_node);
588 parent = &e->rb_node;
589 if (read_original_sector(wc, e) > read_original_sector(wc, ins))
590 node = &parent->rb_left;
591 else
592 node = &parent->rb_right;
593 }
594 rb_link_node(&ins->rb_node, parent, node);
595 rb_insert_color(&ins->rb_node, &wc->tree);
596 list_add(&ins->lru, &wc->lru);
597}
598
599static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
600{
601 list_del(&e->lru);
602 rb_erase(&e->rb_node, &wc->tree);
603}
604
605static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
606{
607 if (WC_MODE_SORT_FREELIST(wc)) {
608 struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;
609 if (unlikely(!*node))
610 wc->current_free = e;
611 while (*node) {
612 parent = *node;
613 if (&e->rb_node < *node)
614 node = &parent->rb_left;
615 else
616 node = &parent->rb_right;
617 }
618 rb_link_node(&e->rb_node, parent, node);
619 rb_insert_color(&e->rb_node, &wc->freetree);
620 } else {
621 list_add_tail(&e->lru, &wc->freelist);
622 }
623 wc->freelist_size++;
624}
625
626static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
627{
628 struct wc_entry *e;
629
630 if (WC_MODE_SORT_FREELIST(wc)) {
631 struct rb_node *next;
632 if (unlikely(!wc->current_free))
633 return NULL;
634 e = wc->current_free;
635 next = rb_next(&e->rb_node);
636 rb_erase(&e->rb_node, &wc->freetree);
637 if (unlikely(!next))
638 next = rb_first(&wc->freetree);
639 wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
640 } else {
641 if (unlikely(list_empty(&wc->freelist)))
642 return NULL;
643 e = container_of(wc->freelist.next, struct wc_entry, lru);
644 list_del(&e->lru);
645 }
646 wc->freelist_size--;
647 if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
648 queue_work(wc->writeback_wq, &wc->writeback_work);
649
650 return e;
651}
652
653static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
654{
655 writecache_unlink(wc, e);
656 writecache_add_to_freelist(wc, e);
657 clear_seq_count(wc, e);
658 writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
659 if (unlikely(waitqueue_active(&wc->freelist_wait)))
660 wake_up(&wc->freelist_wait);
661}
662
663static void writecache_wait_on_freelist(struct dm_writecache *wc)
664{
665 DEFINE_WAIT(wait);
666
667 prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
668 wc_unlock(wc);
669 io_schedule();
670 finish_wait(&wc->freelist_wait, &wait);
671 wc_lock(wc);
672}
673
674static void writecache_poison_lists(struct dm_writecache *wc)
675{
676 /*
677 * Catch incorrect access to these values while the device is suspended.
678 */
679 memset(&wc->tree, -1, sizeof wc->tree);
680 wc->lru.next = LIST_POISON1;
681 wc->lru.prev = LIST_POISON2;
682 wc->freelist.next = LIST_POISON1;
683 wc->freelist.prev = LIST_POISON2;
684}
685
686static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
687{
688 writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
689 if (WC_MODE_PMEM(wc))
690 writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
691}
692
693static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
694{
695 return read_seq_count(wc, e) < wc->seq_count;
696}
697
698static void writecache_flush(struct dm_writecache *wc)
699{
700 struct wc_entry *e, *e2;
701 bool need_flush_after_free;
702
703 wc->uncommitted_blocks = 0;
704 del_timer(&wc->autocommit_timer);
705
706 if (list_empty(&wc->lru))
707 return;
708
709 e = container_of(wc->lru.next, struct wc_entry, lru);
710 if (writecache_entry_is_committed(wc, e)) {
711 if (wc->overwrote_committed) {
712 writecache_wait_for_ios(wc, WRITE);
713 writecache_disk_flush(wc, wc->ssd_dev);
714 wc->overwrote_committed = false;
715 }
716 return;
717 }
718 while (1) {
719 writecache_flush_entry(wc, e);
720 if (unlikely(e->lru.next == &wc->lru))
721 break;
722 e2 = container_of(e->lru.next, struct wc_entry, lru);
723 if (writecache_entry_is_committed(wc, e2))
724 break;
725 e = e2;
726 cond_resched();
727 }
728 writecache_commit_flushed(wc);
729
730 writecache_wait_for_ios(wc, WRITE);
731
732 wc->seq_count++;
733 pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
734 writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
735 writecache_commit_flushed(wc);
736
737 wc->overwrote_committed = false;
738
739 need_flush_after_free = false;
740 while (1) {
741 /* Free another committed entry with lower seq-count */
742 struct rb_node *rb_node = rb_prev(&e->rb_node);
743
744 if (rb_node) {
745 e2 = container_of(rb_node, struct wc_entry, rb_node);
746 if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
747 likely(!e2->write_in_progress)) {
748 writecache_free_entry(wc, e2);
749 need_flush_after_free = true;
750 }
751 }
752 if (unlikely(e->lru.prev == &wc->lru))
753 break;
754 e = container_of(e->lru.prev, struct wc_entry, lru);
755 cond_resched();
756 }
757
758 if (need_flush_after_free)
759 writecache_commit_flushed(wc);
760}
761
762static void writecache_flush_work(struct work_struct *work)
763{
764 struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);
765
766 wc_lock(wc);
767 writecache_flush(wc);
768 wc_unlock(wc);
769}
770
771static void writecache_autocommit_timer(struct timer_list *t)
772{
773 struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);
774 if (!writecache_has_error(wc))
775 queue_work(wc->writeback_wq, &wc->flush_work);
776}
777
778static void writecache_schedule_autocommit(struct dm_writecache *wc)
779{
780 if (!timer_pending(&wc->autocommit_timer))
781 mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
782}
783
784static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
785{
786 struct wc_entry *e;
787 bool discarded_something = false;
788
789 e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
790 if (unlikely(!e))
791 return;
792
793 while (read_original_sector(wc, e) < end) {
794 struct rb_node *node = rb_next(&e->rb_node);
795
796 if (likely(!e->write_in_progress)) {
797 if (!discarded_something) {
798 writecache_wait_for_ios(wc, READ);
799 writecache_wait_for_ios(wc, WRITE);
800 discarded_something = true;
801 }
802 writecache_free_entry(wc, e);
803 }
804
805 if (!node)
806 break;
807
808 e = container_of(node, struct wc_entry, rb_node);
809 }
810
811 if (discarded_something)
812 writecache_commit_flushed(wc);
813}
814
815static bool writecache_wait_for_writeback(struct dm_writecache *wc)
816{
817 if (wc->writeback_size) {
818 writecache_wait_on_freelist(wc);
819 return true;
820 }
821 return false;
822}
823
824static void writecache_suspend(struct dm_target *ti)
825{
826 struct dm_writecache *wc = ti->private;
827 bool flush_on_suspend;
828
829 del_timer_sync(&wc->autocommit_timer);
830
831 wc_lock(wc);
832 writecache_flush(wc);
833 flush_on_suspend = wc->flush_on_suspend;
834 if (flush_on_suspend) {
835 wc->flush_on_suspend = false;
836 wc->writeback_all++;
837 queue_work(wc->writeback_wq, &wc->writeback_work);
838 }
839 wc_unlock(wc);
840
841 flush_workqueue(wc->writeback_wq);
842
843 wc_lock(wc);
844 if (flush_on_suspend)
845 wc->writeback_all--;
846 while (writecache_wait_for_writeback(wc));
847
848 if (WC_MODE_PMEM(wc))
849 persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
850
851 writecache_poison_lists(wc);
852
853 wc_unlock(wc);
854}
855
856static int writecache_alloc_entries(struct dm_writecache *wc)
857{
858 size_t b;
859
860 if (wc->entries)
861 return 0;
862 wc->entries = vmalloc(sizeof(struct wc_entry) * wc->n_blocks);
863 if (!wc->entries)
864 return -ENOMEM;
865 for (b = 0; b < wc->n_blocks; b++) {
866 struct wc_entry *e = &wc->entries[b];
867 e->index = b;
868 e->write_in_progress = false;
869 }
870
871 return 0;
872}
873
874static void writecache_resume(struct dm_target *ti)
875{
876 struct dm_writecache *wc = ti->private;
877 size_t b;
878 bool need_flush = false;
879 __le64 sb_seq_count;
880 int r;
881
882 wc_lock(wc);
883
884 if (WC_MODE_PMEM(wc))
885 persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
886
887 wc->tree = RB_ROOT;
888 INIT_LIST_HEAD(&wc->lru);
889 if (WC_MODE_SORT_FREELIST(wc)) {
890 wc->freetree = RB_ROOT;
891 wc->current_free = NULL;
892 } else {
893 INIT_LIST_HEAD(&wc->freelist);
894 }
895 wc->freelist_size = 0;
896
897 r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t));
898 if (r) {
899 writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
900 sb_seq_count = cpu_to_le64(0);
901 }
902 wc->seq_count = le64_to_cpu(sb_seq_count);
903
904#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
905 for (b = 0; b < wc->n_blocks; b++) {
906 struct wc_entry *e = &wc->entries[b];
907 struct wc_memory_entry wme;
908 if (writecache_has_error(wc)) {
909 e->original_sector = -1;
910 e->seq_count = -1;
911 continue;
912 }
913 r = memcpy_mcsafe(&wme, memory_entry(wc, e), sizeof(struct wc_memory_entry));
914 if (r) {
915 writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
916 (unsigned long)b, r);
917 e->original_sector = -1;
918 e->seq_count = -1;
919 } else {
920 e->original_sector = le64_to_cpu(wme.original_sector);
921 e->seq_count = le64_to_cpu(wme.seq_count);
922 }
923 }
924#endif
925 for (b = 0; b < wc->n_blocks; b++) {
926 struct wc_entry *e = &wc->entries[b];
927 if (!writecache_entry_is_committed(wc, e)) {
928 if (read_seq_count(wc, e) != -1) {
929erase_this:
930 clear_seq_count(wc, e);
931 need_flush = true;
932 }
933 writecache_add_to_freelist(wc, e);
934 } else {
935 struct wc_entry *old;
936
937 old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
938 if (!old) {
939 writecache_insert_entry(wc, e);
940 } else {
941 if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
942 writecache_error(wc, -EINVAL,
943 "two identical entries, position %llu, sector %llu, sequence %llu",
944 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
945 (unsigned long long)read_seq_count(wc, e));
946 }
947 if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
948 goto erase_this;
949 } else {
950 writecache_free_entry(wc, old);
951 writecache_insert_entry(wc, e);
952 need_flush = true;
953 }
954 }
955 }
956 cond_resched();
957 }
958
959 if (need_flush) {
960 writecache_flush_all_metadata(wc);
961 writecache_commit_flushed(wc);
962 }
963
964 wc_unlock(wc);
965}
966
967static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
968{
969 if (argc != 1)
970 return -EINVAL;
971
972 wc_lock(wc);
973 if (dm_suspended(wc->ti)) {
974 wc_unlock(wc);
975 return -EBUSY;
976 }
977 if (writecache_has_error(wc)) {
978 wc_unlock(wc);
979 return -EIO;
980 }
981
982 writecache_flush(wc);
983 wc->writeback_all++;
984 queue_work(wc->writeback_wq, &wc->writeback_work);
985 wc_unlock(wc);
986
987 flush_workqueue(wc->writeback_wq);
988
989 wc_lock(wc);
990 wc->writeback_all--;
991 if (writecache_has_error(wc)) {
992 wc_unlock(wc);
993 return -EIO;
994 }
995 wc_unlock(wc);
996
997 return 0;
998}
999
1000static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1001{
1002 if (argc != 1)
1003 return -EINVAL;
1004
1005 wc_lock(wc);
1006 wc->flush_on_suspend = true;
1007 wc_unlock(wc);
1008
1009 return 0;
1010}
1011
1012static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
1013 char *result, unsigned maxlen)
1014{
1015 int r = -EINVAL;
1016 struct dm_writecache *wc = ti->private;
1017
1018 if (!strcasecmp(argv[0], "flush"))
1019 r = process_flush_mesg(argc, argv, wc);
1020 else if (!strcasecmp(argv[0], "flush_on_suspend"))
1021 r = process_flush_on_suspend_mesg(argc, argv, wc);
1022 else
1023 DMERR("unrecognised message received: %s", argv[0]);
1024
1025 return r;
1026}
1027
1028static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
1029{
1030 void *buf;
1031 unsigned long flags;
1032 unsigned size;
1033 int rw = bio_data_dir(bio);
1034 unsigned remaining_size = wc->block_size;
1035
1036 do {
1037 struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
1038 buf = bvec_kmap_irq(&bv, &flags);
1039 size = bv.bv_len;
1040 if (unlikely(size > remaining_size))
1041 size = remaining_size;
1042
1043 if (rw == READ) {
1044 int r;
1045 r = memcpy_mcsafe(buf, data, size);
1046 flush_dcache_page(bio_page(bio));
1047 if (unlikely(r)) {
1048 writecache_error(wc, r, "hardware memory error when reading data: %d", r);
1049 bio->bi_status = BLK_STS_IOERR;
1050 }
1051 } else {
1052 flush_dcache_page(bio_page(bio));
1053 memcpy_flushcache(data, buf, size);
1054 }
1055
1056 bvec_kunmap_irq(buf, &flags);
1057
1058 data = (char *)data + size;
1059 remaining_size -= size;
1060 bio_advance(bio, size);
1061 } while (unlikely(remaining_size));
1062}
1063
1064static int writecache_flush_thread(void *data)
1065{
1066 struct dm_writecache *wc = data;
1067
1068 while (1) {
1069 struct bio *bio;
1070
1071 wc_lock(wc);
1072 bio = bio_list_pop(&wc->flush_list);
1073 if (!bio) {
1074 set_current_state(TASK_INTERRUPTIBLE);
1075 wc_unlock(wc);
1076
1077 if (unlikely(kthread_should_stop())) {
1078 set_current_state(TASK_RUNNING);
1079 break;
1080 }
1081
1082 schedule();
1083 continue;
1084 }
1085
1086 if (bio_op(bio) == REQ_OP_DISCARD) {
1087 writecache_discard(wc, bio->bi_iter.bi_sector,
1088 bio_end_sector(bio));
1089 wc_unlock(wc);
1090 bio_set_dev(bio, wc->dev->bdev);
1091 generic_make_request(bio);
1092 } else {
1093 writecache_flush(wc);
1094 wc_unlock(wc);
1095 if (writecache_has_error(wc))
1096 bio->bi_status = BLK_STS_IOERR;
1097 bio_endio(bio);
1098 }
1099 }
1100
1101 return 0;
1102}
1103
1104static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
1105{
1106 if (bio_list_empty(&wc->flush_list))
1107 wake_up_process(wc->flush_thread);
1108 bio_list_add(&wc->flush_list, bio);
1109}
1110
1111static int writecache_map(struct dm_target *ti, struct bio *bio)
1112{
1113 struct wc_entry *e;
1114 struct dm_writecache *wc = ti->private;
1115
1116 bio->bi_private = NULL;
1117
1118 wc_lock(wc);
1119
1120 if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1121 if (writecache_has_error(wc))
1122 goto unlock_error;
1123 if (WC_MODE_PMEM(wc)) {
1124 writecache_flush(wc);
1125 if (writecache_has_error(wc))
1126 goto unlock_error;
1127 goto unlock_submit;
1128 } else {
1129 writecache_offload_bio(wc, bio);
1130 goto unlock_return;
1131 }
1132 }
1133
1134 bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
1135
1136 if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
1137 (wc->block_size / 512 - 1)) != 0)) {
1138 DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
1139 (unsigned long long)bio->bi_iter.bi_sector,
1140 bio->bi_iter.bi_size, wc->block_size);
1141 goto unlock_error;
1142 }
1143
1144 if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
1145 if (writecache_has_error(wc))
1146 goto unlock_error;
1147 if (WC_MODE_PMEM(wc)) {
1148 writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
1149 goto unlock_remap_origin;
1150 } else {
1151 writecache_offload_bio(wc, bio);
1152 goto unlock_return;
1153 }
1154 }
1155
1156 if (bio_data_dir(bio) == READ) {
1157read_next_block:
1158 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
1159 if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
1160 if (WC_MODE_PMEM(wc)) {
1161 bio_copy_block(wc, bio, memory_data(wc, e));
1162 if (bio->bi_iter.bi_size)
1163 goto read_next_block;
1164 goto unlock_submit;
1165 } else {
1166 dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
1167 bio_set_dev(bio, wc->ssd_dev->bdev);
1168 bio->bi_iter.bi_sector = cache_sector(wc, e);
1169 if (!writecache_entry_is_committed(wc, e))
1170 writecache_wait_for_ios(wc, WRITE);
1171 goto unlock_remap;
1172 }
1173 } else {
1174 if (e) {
1175 sector_t next_boundary =
1176 read_original_sector(wc, e) - bio->bi_iter.bi_sector;
1177 if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
1178 dm_accept_partial_bio(bio, next_boundary);
1179 }
1180 }
1181 goto unlock_remap_origin;
1182 }
1183 } else {
1184 do {
1185 if (writecache_has_error(wc))
1186 goto unlock_error;
1187 e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
1188 if (e) {
1189 if (!writecache_entry_is_committed(wc, e))
1190 goto bio_copy;
1191 if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
1192 wc->overwrote_committed = true;
1193 goto bio_copy;
1194 }
1195 }
1196 e = writecache_pop_from_freelist(wc);
1197 if (unlikely(!e)) {
1198 writecache_wait_on_freelist(wc);
1199 continue;
1200 }
1201 write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
1202 writecache_insert_entry(wc, e);
1203 wc->uncommitted_blocks++;
1204bio_copy:
1205 if (WC_MODE_PMEM(wc)) {
1206 bio_copy_block(wc, bio, memory_data(wc, e));
1207 } else {
1208 dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
1209 bio_set_dev(bio, wc->ssd_dev->bdev);
1210 bio->bi_iter.bi_sector = cache_sector(wc, e);
1211 if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
1212 wc->uncommitted_blocks = 0;
1213 queue_work(wc->writeback_wq, &wc->flush_work);
1214 } else {
1215 writecache_schedule_autocommit(wc);
1216 }
1217 goto unlock_remap;
1218 }
1219 } while (bio->bi_iter.bi_size);
1220
1221 if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks))
1222 writecache_flush(wc);
1223 else
1224 writecache_schedule_autocommit(wc);
1225 goto unlock_submit;
1226 }
1227
1228unlock_remap_origin:
1229 bio_set_dev(bio, wc->dev->bdev);
1230 wc_unlock(wc);
1231 return DM_MAPIO_REMAPPED;
1232
1233unlock_remap:
1234 /* make sure that writecache_end_io decrements bio_in_progress: */
1235 bio->bi_private = (void *)1;
1236 atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
1237 wc_unlock(wc);
1238 return DM_MAPIO_REMAPPED;
1239
1240unlock_submit:
1241 wc_unlock(wc);
1242 bio_endio(bio);
1243 return DM_MAPIO_SUBMITTED;
1244
1245unlock_return:
1246 wc_unlock(wc);
1247 return DM_MAPIO_SUBMITTED;
1248
1249unlock_error:
1250 wc_unlock(wc);
1251 bio_io_error(bio);
1252 return DM_MAPIO_SUBMITTED;
1253}
1254
1255static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
1256{
1257 struct dm_writecache *wc = ti->private;
1258
1259 if (bio->bi_private != NULL) {
1260 int dir = bio_data_dir(bio);
1261 if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
1262 if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
1263 wake_up(&wc->bio_in_progress_wait[dir]);
1264 }
1265 return 0;
1266}
1267
1268static int writecache_iterate_devices(struct dm_target *ti,
1269 iterate_devices_callout_fn fn, void *data)
1270{
1271 struct dm_writecache *wc = ti->private;
1272
1273 return fn(ti, wc->dev, 0, ti->len, data);
1274}
1275
1276static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
1277{
1278 struct dm_writecache *wc = ti->private;
1279
1280 if (limits->logical_block_size < wc->block_size)
1281 limits->logical_block_size = wc->block_size;
1282
1283 if (limits->physical_block_size < wc->block_size)
1284 limits->physical_block_size = wc->block_size;
1285
1286 if (limits->io_min < wc->block_size)
1287 limits->io_min = wc->block_size;
1288}
1289
1290
1291static void writecache_writeback_endio(struct bio *bio)
1292{
1293 struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
1294 struct dm_writecache *wc = wb->wc;
1295 unsigned long flags;
1296
1297 raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
1298 if (unlikely(list_empty(&wc->endio_list)))
1299 wake_up_process(wc->endio_thread);
1300 list_add_tail(&wb->endio_entry, &wc->endio_list);
1301 raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
1302}
1303
1304static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
1305{
1306 struct copy_struct *c = ptr;
1307 struct dm_writecache *wc = c->wc;
1308
1309 c->error = likely(!(read_err | write_err)) ? 0 : -EIO;
1310
1311 raw_spin_lock_irq(&wc->endio_list_lock);
1312 if (unlikely(list_empty(&wc->endio_list)))
1313 wake_up_process(wc->endio_thread);
1314 list_add_tail(&c->endio_entry, &wc->endio_list);
1315 raw_spin_unlock_irq(&wc->endio_list_lock);
1316}
1317
1318static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
1319{
1320 unsigned i;
1321 struct writeback_struct *wb;
1322 struct wc_entry *e;
1323 unsigned long n_walked = 0;
1324
1325 do {
1326 wb = list_entry(list->next, struct writeback_struct, endio_entry);
1327 list_del(&wb->endio_entry);
1328
1329 if (unlikely(wb->bio.bi_status != BLK_STS_OK))
1330 writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
1331 "write error %d", wb->bio.bi_status);
1332 i = 0;
1333 do {
1334 e = wb->wc_list[i];
1335 BUG_ON(!e->write_in_progress);
1336 e->write_in_progress = false;
1337 INIT_LIST_HEAD(&e->lru);
1338 if (!writecache_has_error(wc))
1339 writecache_free_entry(wc, e);
1340 BUG_ON(!wc->writeback_size);
1341 wc->writeback_size--;
1342 n_walked++;
1343 if (unlikely(n_walked >= ENDIO_LATENCY)) {
1344 writecache_commit_flushed(wc);
1345 wc_unlock(wc);
1346 wc_lock(wc);
1347 n_walked = 0;
1348 }
1349 } while (++i < wb->wc_list_n);
1350
1351 if (wb->wc_list != wb->wc_list_inline)
1352 kfree(wb->wc_list);
1353 bio_put(&wb->bio);
1354 } while (!list_empty(list));
1355}
1356
1357static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
1358{
1359 struct copy_struct *c;
1360 struct wc_entry *e;
1361
1362 do {
1363 c = list_entry(list->next, struct copy_struct, endio_entry);
1364 list_del(&c->endio_entry);
1365
1366 if (unlikely(c->error))
1367 writecache_error(wc, c->error, "copy error");
1368
1369 e = c->e;
1370 do {
1371 BUG_ON(!e->write_in_progress);
1372 e->write_in_progress = false;
1373 INIT_LIST_HEAD(&e->lru);
1374 if (!writecache_has_error(wc))
1375 writecache_free_entry(wc, e);
1376
1377 BUG_ON(!wc->writeback_size);
1378 wc->writeback_size--;
1379 e++;
1380 } while (--c->n_entries);
1381 mempool_free(c, &wc->copy_pool);
1382 } while (!list_empty(list));
1383}
1384
1385static int writecache_endio_thread(void *data)
1386{
1387 struct dm_writecache *wc = data;
1388
1389 while (1) {
1390 struct list_head list;
1391
1392 raw_spin_lock_irq(&wc->endio_list_lock);
1393 if (!list_empty(&wc->endio_list))
1394 goto pop_from_list;
1395 set_current_state(TASK_INTERRUPTIBLE);
1396 raw_spin_unlock_irq(&wc->endio_list_lock);
1397
1398 if (unlikely(kthread_should_stop())) {
1399 set_current_state(TASK_RUNNING);
1400 break;
1401 }
1402
1403 schedule();
1404
1405 continue;
1406
1407pop_from_list:
1408 list = wc->endio_list;
1409 list.next->prev = list.prev->next = &list;
1410 INIT_LIST_HEAD(&wc->endio_list);
1411 raw_spin_unlock_irq(&wc->endio_list_lock);
1412
1413 if (!WC_MODE_FUA(wc))
1414 writecache_disk_flush(wc, wc->dev);
1415
1416 wc_lock(wc);
1417
1418 if (WC_MODE_PMEM(wc)) {
1419 __writecache_endio_pmem(wc, &list);
1420 } else {
1421 __writecache_endio_ssd(wc, &list);
1422 writecache_wait_for_ios(wc, READ);
1423 }
1424
1425 writecache_commit_flushed(wc);
1426
1427 wc_unlock(wc);
1428 }
1429
1430 return 0;
1431}
1432
1433static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp)
1434{
1435 struct dm_writecache *wc = wb->wc;
1436 unsigned block_size = wc->block_size;
1437 void *address = memory_data(wc, e);
1438
1439 persistent_memory_flush_cache(address, block_size);
1440 return bio_add_page(&wb->bio, persistent_memory_page(address),
1441 block_size, persistent_memory_page_offset(address)) != 0;
1442}
1443
1444struct writeback_list {
1445 struct list_head list;
1446 size_t size;
1447};
1448
1449static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
1450{
1451 if (unlikely(wc->max_writeback_jobs)) {
1452 if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
1453 wc_lock(wc);
1454 while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
1455 writecache_wait_on_freelist(wc);
1456 wc_unlock(wc);
1457 }
1458 }
1459 cond_resched();
1460}
1461
1462static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
1463{
1464 struct wc_entry *e, *f;
1465 struct bio *bio;
1466 struct writeback_struct *wb;
1467 unsigned max_pages;
1468
1469 while (wbl->size) {
1470 wbl->size--;
1471 e = container_of(wbl->list.prev, struct wc_entry, lru);
1472 list_del(&e->lru);
1473
1474 max_pages = e->wc_list_contiguous;
1475
1476 bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
1477 wb = container_of(bio, struct writeback_struct, bio);
1478 wb->wc = wc;
1479 wb->bio.bi_end_io = writecache_writeback_endio;
1480 bio_set_dev(&wb->bio, wc->dev->bdev);
1481 wb->bio.bi_iter.bi_sector = read_original_sector(wc, e);
1482 wb->page_offset = PAGE_SIZE;
1483 if (max_pages <= WB_LIST_INLINE ||
1484 unlikely(!(wb->wc_list = kmalloc(max_pages * sizeof(struct wc_entry *),
1485 GFP_NOIO | __GFP_NORETRY |
1486 __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
1487 wb->wc_list = wb->wc_list_inline;
1488 max_pages = WB_LIST_INLINE;
1489 }
1490
1491 BUG_ON(!wc_add_block(wb, e, GFP_NOIO));
1492
1493 wb->wc_list[0] = e;
1494 wb->wc_list_n = 1;
1495
1496 while (wbl->size && wb->wc_list_n < max_pages) {
1497 f = container_of(wbl->list.prev, struct wc_entry, lru);
1498 if (read_original_sector(wc, f) !=
1499 read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
1500 break;
1501 if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN))
1502 break;
1503 wbl->size--;
1504 list_del(&f->lru);
1505 wb->wc_list[wb->wc_list_n++] = f;
1506 e = f;
1507 }
1508 bio_set_op_attrs(&wb->bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
1509 if (writecache_has_error(wc)) {
1510 bio->bi_status = BLK_STS_IOERR;
1511 bio_endio(&wb->bio);
1512 } else {
1513 submit_bio(&wb->bio);
1514 }
1515
1516 __writeback_throttle(wc, wbl);
1517 }
1518}
1519
1520static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
1521{
1522 struct wc_entry *e, *f;
1523 struct dm_io_region from, to;
1524 struct copy_struct *c;
1525
1526 while (wbl->size) {
1527 unsigned n_sectors;
1528
1529 wbl->size--;
1530 e = container_of(wbl->list.prev, struct wc_entry, lru);
1531 list_del(&e->lru);
1532
1533 n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);
1534
1535 from.bdev = wc->ssd_dev->bdev;
1536 from.sector = cache_sector(wc, e);
1537 from.count = n_sectors;
1538 to.bdev = wc->dev->bdev;
1539 to.sector = read_original_sector(wc, e);
1540 to.count = n_sectors;
1541
1542 c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
1543 c->wc = wc;
1544 c->e = e;
1545 c->n_entries = e->wc_list_contiguous;
1546
1547 while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
1548 wbl->size--;
1549 f = container_of(wbl->list.prev, struct wc_entry, lru);
1550 BUG_ON(f != e + 1);
1551 list_del(&f->lru);
1552 e = f;
1553 }
1554
1555 dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
1556
1557 __writeback_throttle(wc, wbl);
1558 }
1559}
1560
1561static void writecache_writeback(struct work_struct *work)
1562{
1563 struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
1564 struct blk_plug plug;
1565 struct wc_entry *e, *f, *g;
1566 struct rb_node *node, *next_node;
1567 struct list_head skipped;
1568 struct writeback_list wbl;
1569 unsigned long n_walked;
1570
1571 wc_lock(wc);
1572restart:
1573 if (writecache_has_error(wc)) {
1574 wc_unlock(wc);
1575 return;
1576 }
1577
1578 if (unlikely(wc->writeback_all)) {
1579 if (writecache_wait_for_writeback(wc))
1580 goto restart;
1581 }
1582
1583 if (wc->overwrote_committed) {
1584 writecache_wait_for_ios(wc, WRITE);
1585 }
1586
1587 n_walked = 0;
1588 INIT_LIST_HEAD(&skipped);
1589 INIT_LIST_HEAD(&wbl.list);
1590 wbl.size = 0;
1591 while (!list_empty(&wc->lru) &&
1592 (wc->writeback_all ||
1593 wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark)) {
1594
1595 n_walked++;
1596 if (unlikely(n_walked > WRITEBACK_LATENCY) &&
1597 likely(!wc->writeback_all) && likely(!dm_suspended(wc->ti))) {
1598 queue_work(wc->writeback_wq, &wc->writeback_work);
1599 break;
1600 }
1601
1602 e = container_of(wc->lru.prev, struct wc_entry, lru);
1603 BUG_ON(e->write_in_progress);
1604 if (unlikely(!writecache_entry_is_committed(wc, e))) {
1605 writecache_flush(wc);
1606 }
1607 node = rb_prev(&e->rb_node);
1608 if (node) {
1609 f = container_of(node, struct wc_entry, rb_node);
1610 if (unlikely(read_original_sector(wc, f) ==
1611 read_original_sector(wc, e))) {
1612 BUG_ON(!f->write_in_progress);
1613 list_del(&e->lru);
1614 list_add(&e->lru, &skipped);
1615 cond_resched();
1616 continue;
1617 }
1618 }
1619 wc->writeback_size++;
1620 list_del(&e->lru);
1621 list_add(&e->lru, &wbl.list);
1622 wbl.size++;
1623 e->write_in_progress = true;
1624 e->wc_list_contiguous = 1;
1625
1626 f = e;
1627
1628 while (1) {
1629 next_node = rb_next(&f->rb_node);
1630 if (unlikely(!next_node))
1631 break;
1632 g = container_of(next_node, struct wc_entry, rb_node);
1633 if (read_original_sector(wc, g) ==
1634 read_original_sector(wc, f)) {
1635 f = g;
1636 continue;
1637 }
1638 if (read_original_sector(wc, g) !=
1639 read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
1640 break;
1641 if (unlikely(g->write_in_progress))
1642 break;
1643 if (unlikely(!writecache_entry_is_committed(wc, g)))
1644 break;
1645
1646 if (!WC_MODE_PMEM(wc)) {
1647 if (g != f + 1)
1648 break;
1649 }
1650
1651 n_walked++;
1652 //if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
1653 // break;
1654
1655 wc->writeback_size++;
1656 list_del(&g->lru);
1657 list_add(&g->lru, &wbl.list);
1658 wbl.size++;
1659 g->write_in_progress = true;
1660 g->wc_list_contiguous = BIO_MAX_PAGES;
1661 f = g;
1662 e->wc_list_contiguous++;
1663 if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES))
1664 break;
1665 }
1666 cond_resched();
1667 }
1668
1669 if (!list_empty(&skipped)) {
1670 list_splice_tail(&skipped, &wc->lru);
1671 /*
1672 * If we didn't do any progress, we must wait until some
1673 * writeback finishes to avoid burning CPU in a loop
1674 */
1675 if (unlikely(!wbl.size))
1676 writecache_wait_for_writeback(wc);
1677 }
1678
1679 wc_unlock(wc);
1680
1681 blk_start_plug(&plug);
1682
1683 if (WC_MODE_PMEM(wc))
1684 __writecache_writeback_pmem(wc, &wbl);
1685 else
1686 __writecache_writeback_ssd(wc, &wbl);
1687
1688 blk_finish_plug(&plug);
1689
1690 if (unlikely(wc->writeback_all)) {
1691 wc_lock(wc);
1692 while (writecache_wait_for_writeback(wc));
1693 wc_unlock(wc);
1694 }
1695}
1696
1697static int calculate_memory_size(uint64_t device_size, unsigned block_size,
1698 size_t *n_blocks_p, size_t *n_metadata_blocks_p)
1699{
1700 uint64_t n_blocks, offset;
1701 struct wc_entry e;
1702
1703 n_blocks = device_size;
1704 do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));
1705
1706 while (1) {
1707 if (!n_blocks)
1708 return -ENOSPC;
1709 /* Verify the following entries[n_blocks] won't overflow */
1710 if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
1711 sizeof(struct wc_memory_entry)))
1712 return -EFBIG;
1713 offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
1714 offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
1715 if (offset + n_blocks * block_size <= device_size)
1716 break;
1717 n_blocks--;
1718 }
1719
1720 /* check if the bit field overflows */
1721 e.index = n_blocks;
1722 if (e.index != n_blocks)
1723 return -EFBIG;
1724
1725 if (n_blocks_p)
1726 *n_blocks_p = n_blocks;
1727 if (n_metadata_blocks_p)
1728 *n_metadata_blocks_p = offset >> __ffs(block_size);
1729 return 0;
1730}
1731
1732static int init_memory(struct dm_writecache *wc)
1733{
1734 size_t b;
1735 int r;
1736
1737 r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
1738 if (r)
1739 return r;
1740
1741 r = writecache_alloc_entries(wc);
1742 if (r)
1743 return r;
1744
1745 for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
1746 pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
1747 pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
1748 pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
1749 pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
1750 pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
1751
1752 for (b = 0; b < wc->n_blocks; b++)
1753 write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
1754
1755 writecache_flush_all_metadata(wc);
1756 writecache_commit_flushed(wc);
1757 pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
1758 writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
1759 writecache_commit_flushed(wc);
1760
1761 return 0;
1762}
1763
1764static void writecache_dtr(struct dm_target *ti)
1765{
1766 struct dm_writecache *wc = ti->private;
1767
1768 if (!wc)
1769 return;
1770
1771 if (wc->endio_thread)
1772 kthread_stop(wc->endio_thread);
1773
1774 if (wc->flush_thread)
1775 kthread_stop(wc->flush_thread);
1776
1777 bioset_exit(&wc->bio_set);
1778
1779 mempool_exit(&wc->copy_pool);
1780
1781 if (wc->writeback_wq)
1782 destroy_workqueue(wc->writeback_wq);
1783
1784 if (wc->dev)
1785 dm_put_device(ti, wc->dev);
1786
1787 if (wc->ssd_dev)
1788 dm_put_device(ti, wc->ssd_dev);
1789
1790 if (wc->entries)
1791 vfree(wc->entries);
1792
1793 if (wc->memory_map) {
1794 if (WC_MODE_PMEM(wc))
1795 persistent_memory_release(wc);
1796 else
1797 vfree(wc->memory_map);
1798 }
1799
1800 if (wc->dm_kcopyd)
1801 dm_kcopyd_client_destroy(wc->dm_kcopyd);
1802
1803 if (wc->dm_io)
1804 dm_io_client_destroy(wc->dm_io);
1805
1806 if (wc->dirty_bitmap)
1807 vfree(wc->dirty_bitmap);
1808
1809 kfree(wc);
1810}
1811
1812static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
1813{
1814 struct dm_writecache *wc;
1815 struct dm_arg_set as;
1816 const char *string;
1817 unsigned opt_params;
1818 size_t offset, data_size;
1819 int i, r;
1820 char dummy;
1821 int high_wm_percent = HIGH_WATERMARK;
1822 int low_wm_percent = LOW_WATERMARK;
1823 uint64_t x;
1824 struct wc_memory_superblock s;
1825
1826 static struct dm_arg _args[] = {
1827 {0, 10, "Invalid number of feature args"},
1828 };
1829
1830 as.argc = argc;
1831 as.argv = argv;
1832
1833 wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
1834 if (!wc) {
1835 ti->error = "Cannot allocate writecache structure";
1836 r = -ENOMEM;
1837 goto bad;
1838 }
1839 ti->private = wc;
1840 wc->ti = ti;
1841
1842 mutex_init(&wc->lock);
1843 writecache_poison_lists(wc);
1844 init_waitqueue_head(&wc->freelist_wait);
1845 timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
1846
1847 for (i = 0; i < 2; i++) {
1848 atomic_set(&wc->bio_in_progress[i], 0);
1849 init_waitqueue_head(&wc->bio_in_progress_wait[i]);
1850 }
1851
1852 wc->dm_io = dm_io_client_create();
1853 if (IS_ERR(wc->dm_io)) {
1854 r = PTR_ERR(wc->dm_io);
1855 ti->error = "Unable to allocate dm-io client";
1856 wc->dm_io = NULL;
1857 goto bad;
1858 }
1859
 1860 wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
1861 if (!wc->writeback_wq) {
1862 r = -ENOMEM;
1863 ti->error = "Could not allocate writeback workqueue";
1864 goto bad;
1865 }
1866 INIT_WORK(&wc->writeback_work, writecache_writeback);
1867 INIT_WORK(&wc->flush_work, writecache_flush_work);
1868
1869 raw_spin_lock_init(&wc->endio_list_lock);
1870 INIT_LIST_HEAD(&wc->endio_list);
1871 wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio");
1872 if (IS_ERR(wc->endio_thread)) {
1873 r = PTR_ERR(wc->endio_thread);
1874 wc->endio_thread = NULL;
1875 ti->error = "Couldn't spawn endio thread";
1876 goto bad;
1877 }
1878 wake_up_process(wc->endio_thread);
1879
1880 /*
1881 * Parse the mode (pmem or ssd)
1882 */
1883 string = dm_shift_arg(&as);
1884 if (!string)
1885 goto bad_arguments;
1886
1887 if (!strcasecmp(string, "s")) {
1888 wc->pmem_mode = false;
1889 } else if (!strcasecmp(string, "p")) {
1890#ifdef DM_WRITECACHE_HAS_PMEM
1891 wc->pmem_mode = true;
1892 wc->writeback_fua = true;
1893#else
1894 /*
1895 * If the architecture doesn't support persistent memory or
1896 * the kernel doesn't support any DAX drivers, this driver can
1897 * only be used in SSD-only mode.
1898 */
1899 r = -EOPNOTSUPP;
1900 ti->error = "Persistent memory or DAX not supported on this system";
1901 goto bad;
1902#endif
1903 } else {
1904 goto bad_arguments;
1905 }
1906
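	/*
	 * pmem mode issues its own writeback bios (struct writeback_struct
	 * embeds the bio), so it needs a bio set; ssd mode hands copying off
	 * to dm-kcopyd and only needs a small pool of copy_struct entries.
	 */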
1907 if (WC_MODE_PMEM(wc)) {
1908 r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
1909 offsetof(struct writeback_struct, bio),
1910 BIOSET_NEED_BVECS);
1911 if (r) {
1912 ti->error = "Could not allocate bio set";
1913 goto bad;
1914 }
1915 } else {
1916 r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
1917 if (r) {
1918 ti->error = "Could not allocate mempool";
1919 goto bad;
1920 }
1921 }
1922
1923 /*
1924 * Parse the origin data device
1925 */
1926 string = dm_shift_arg(&as);
1927 if (!string)
1928 goto bad_arguments;
1929 r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
1930 if (r) {
1931 ti->error = "Origin data device lookup failed";
1932 goto bad;
1933 }
1934
1935 /*
1936 * Parse cache data device (be it pmem or ssd)
1937 */
1938 string = dm_shift_arg(&as);
1939 if (!string)
1940 goto bad_arguments;
1941
1942 r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
1943 if (r) {
1944 ti->error = "Cache data device lookup failed";
1945 goto bad;
1946 }
1947 wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode);
1948
1949 if (WC_MODE_PMEM(wc)) {
1950 r = persistent_memory_claim(wc);
1951 if (r) {
1952 ti->error = "Unable to map persistent memory for cache";
1953 goto bad;
1954 }
1955 }
1956
1957 /*
1958 * Parse the cache block size
1959 */
1960 string = dm_shift_arg(&as);
1961 if (!string)
1962 goto bad_arguments;
1963 if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
1964 wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
1965 (wc->block_size & (wc->block_size - 1))) {
1966 r = -EINVAL;
1967 ti->error = "Invalid block size";
1968 goto bad;
1969 }
1970 wc->block_size_bits = __ffs(wc->block_size);
1971
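	/* Defaults; the optional arguments parsed below may override these. */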
1972 wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
1973 wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
1974 wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);
1975
1976 /*
1977 * Parse optional arguments
1978 */
1979 r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
1980 if (r)
1981 goto bad;
1982
1983 while (opt_params) {
1984 string = dm_shift_arg(&as), opt_params--;
1985 if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
1986 string = dm_shift_arg(&as), opt_params--;
1987 if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
1988 goto invalid_optional;
1989 if (high_wm_percent < 0 || high_wm_percent > 100)
1990 goto invalid_optional;
1991 wc->high_wm_percent_set = true;
1992 } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
1993 string = dm_shift_arg(&as), opt_params--;
1994 if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
1995 goto invalid_optional;
1996 if (low_wm_percent < 0 || low_wm_percent > 100)
1997 goto invalid_optional;
1998 wc->low_wm_percent_set = true;
1999 } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
2000 string = dm_shift_arg(&as), opt_params--;
2001 if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
2002 goto invalid_optional;
2003 wc->max_writeback_jobs_set = true;
2004 } else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
2005 string = dm_shift_arg(&as), opt_params--;
2006 if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
2007 goto invalid_optional;
2008 wc->autocommit_blocks_set = true;
2009 } else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
2010 unsigned autocommit_msecs;
2011 string = dm_shift_arg(&as), opt_params--;
2012 if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
2013 goto invalid_optional;
2014 if (autocommit_msecs > 3600000)
2015 goto invalid_optional;
2016 wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
2017 wc->autocommit_time_set = true;
2018 } else if (!strcasecmp(string, "fua")) {
2019 if (WC_MODE_PMEM(wc)) {
2020 wc->writeback_fua = true;
2021 wc->writeback_fua_set = true;
2022 } else goto invalid_optional;
2023 } else if (!strcasecmp(string, "nofua")) {
2024 if (WC_MODE_PMEM(wc)) {
2025 wc->writeback_fua = false;
2026 wc->writeback_fua_set = true;
2027 } else goto invalid_optional;
2028 } else {
2029invalid_optional:
2030 r = -EINVAL;
2031 ti->error = "Invalid optional argument";
2032 goto bad;
2033 }
2034 }
2035
2036 if (high_wm_percent < low_wm_percent) {
2037 r = -EINVAL;
2038 ti->error = "High watermark must be greater than or equal to low watermark";
2039 goto bad;
2040 }
2041
2042 if (!WC_MODE_PMEM(wc)) {
2043 struct dm_io_region region;
2044 struct dm_io_request req;
2045 size_t n_blocks, n_metadata_blocks;
2046 uint64_t n_bitmap_bits;
2047
2048 bio_list_init(&wc->flush_list);
2049 wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush");
2050 if (IS_ERR(wc->flush_thread)) {
2051 r = PTR_ERR(wc->flush_thread);
2052 wc->flush_thread = NULL;
2053			ti->error = "Couldn't spawn flush thread";
2054 goto bad;
2055 }
2056 wake_up_process(wc->flush_thread);
2057
2058 r = calculate_memory_size(wc->memory_map_size, wc->block_size,
2059 &n_blocks, &n_metadata_blocks);
2060 if (r) {
2061 ti->error = "Invalid device size";
2062 goto bad;
2063 }
2064
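		/*
		 * One dirty bit per BITMAP_GRANULARITY bytes of on-disk
		 * metadata, rounded up.
		 */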
2065 n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
2066 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
2067		/* this is a limitation of the test_bit functions */
2068 if (n_bitmap_bits > 1U << 31) {
2069 r = -EFBIG;
2070 ti->error = "Invalid device size";
2071 goto bad;
2072 }
2073
2074 wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
2075 if (!wc->memory_map) {
2076 r = -ENOMEM;
2077 ti->error = "Unable to allocate memory for metadata";
2078 goto bad;
2079 }
2080
2081 wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2082 if (IS_ERR(wc->dm_kcopyd)) {
2083 r = PTR_ERR(wc->dm_kcopyd);
2084 ti->error = "Unable to allocate dm-kcopyd client";
2085 wc->dm_kcopyd = NULL;
2086 goto bad;
2087 }
2088
2089 wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
2090 wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
2091 BITS_PER_LONG * sizeof(unsigned long);
2092 wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
2093 if (!wc->dirty_bitmap) {
2094 r = -ENOMEM;
2095 ti->error = "Unable to allocate dirty bitmap";
2096 goto bad;
2097 }
2098
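		/*
		 * In ssd mode the metadata lives in the vmalloc()ed memory_map
		 * and is written back to the cache device as needed; prime that
		 * in-memory copy by reading the metadata area synchronously.
		 */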
2099 region.bdev = wc->ssd_dev->bdev;
2100 region.sector = 0;
2101 region.count = wc->metadata_sectors;
2102 req.bi_op = REQ_OP_READ;
2103 req.bi_op_flags = REQ_SYNC;
2104 req.mem.type = DM_IO_VMA;
2105 req.mem.ptr.vma = (char *)wc->memory_map;
2106 req.client = wc->dm_io;
2107 req.notify.fn = NULL;
2108
2109 r = dm_io(&req, 1, &region, NULL);
2110 if (r) {
2111 ti->error = "Unable to read metadata";
2112 goto bad;
2113 }
2114 }
2115
2116 r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
2117 if (r) {
2118 ti->error = "Hardware memory error when reading superblock";
2119 goto bad;
2120 }
2121 if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
2122 r = init_memory(wc);
2123 if (r) {
2124 ti->error = "Unable to initialize device";
2125 goto bad;
2126 }
2127 r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
2128 if (r) {
2129 ti->error = "Hardware memory error when reading superblock";
2130 goto bad;
2131 }
2132 }
2133
2134 if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
2135 ti->error = "Invalid magic in the superblock";
2136 r = -EINVAL;
2137 goto bad;
2138 }
2139
2140 if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
2141 ti->error = "Invalid version in the superblock";
2142 r = -EINVAL;
2143 goto bad;
2144 }
2145
2146 if (le32_to_cpu(s.block_size) != wc->block_size) {
2147 ti->error = "Block size does not match superblock";
2148 r = -EINVAL;
2149 goto bad;
2150 }
2151
2152 wc->n_blocks = le64_to_cpu(s.n_blocks);
2153
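	/*
	 * Cache layout: superblock, one wc_memory_entry per block (rounded up
	 * to a block boundary), then the data blocks. Each step of the size
	 * calculation is checked for overflow before the total is compared
	 * against the size of the cache device.
	 */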
2154 offset = wc->n_blocks * sizeof(struct wc_memory_entry);
2155 if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
2156overflow:
2157 ti->error = "Overflow in size calculation";
2158 r = -EINVAL;
2159 goto bad;
2160 }
2161 offset += sizeof(struct wc_memory_superblock);
2162 if (offset < sizeof(struct wc_memory_superblock))
2163 goto overflow;
2164 offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
2165 data_size = wc->n_blocks * (size_t)wc->block_size;
2166 if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
2167 (offset + data_size < offset))
2168 goto overflow;
2169 if (offset + data_size > wc->memory_map_size) {
2170 ti->error = "Memory area is too small";
2171 r = -EINVAL;
2172 goto bad;
2173 }
2174
2175 wc->metadata_sectors = offset >> SECTOR_SHIFT;
2176 wc->block_start = (char *)sb(wc) + offset;
2177
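	/*
	 * Convert the used-space percentages into free-block thresholds,
	 * rounded to the nearest block. Example: with 1000 cache blocks and
	 * high_watermark 50, freelist_high_watermark = (1000 * 50 + 50) / 100
	 * = 500 free blocks, i.e. writeback starts once half the cache is used.
	 */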
2178 x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
2179 x += 50;
2180 do_div(x, 100);
2181 wc->freelist_high_watermark = x;
2182 x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
2183 x += 50;
2184 do_div(x, 100);
2185 wc->freelist_low_watermark = x;
2186
2187 r = writecache_alloc_entries(wc);
2188 if (r) {
2189 ti->error = "Cannot allocate memory";
2190 goto bad;
2191 }
2192
2193 ti->num_flush_bios = 1;
2194 ti->flush_supported = true;
2195 ti->num_discard_bios = 1;
2196
2197 if (WC_MODE_PMEM(wc))
2198 persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
2199
2200 return 0;
2201
2202bad_arguments:
2203 r = -EINVAL;
2204 ti->error = "Bad arguments";
2205bad:
2206 writecache_dtr(ti);
2207 return r;
2208}
2209
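/*
 * STATUSTYPE_INFO reports the error indicator, the total number of blocks,
 * the number of free blocks and the number of blocks under writeback.
 * STATUSTYPE_TABLE reconstructs the table line, emitting only the optional
 * arguments that were explicitly set.
 */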
2210static void writecache_status(struct dm_target *ti, status_type_t type,
2211 unsigned status_flags, char *result, unsigned maxlen)
2212{
2213 struct dm_writecache *wc = ti->private;
2214 unsigned extra_args;
2215 unsigned sz = 0;
2216 uint64_t x;
2217
2218 switch (type) {
2219 case STATUSTYPE_INFO:
2220 DMEMIT("%ld %llu %llu %llu", writecache_has_error(wc),
2221 (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
2222 (unsigned long long)wc->writeback_size);
2223 break;
2224 case STATUSTYPE_TABLE:
2225 DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
2226 wc->dev->name, wc->ssd_dev->name, wc->block_size);
2227 extra_args = 0;
2228 if (wc->high_wm_percent_set)
2229 extra_args += 2;
2230 if (wc->low_wm_percent_set)
2231 extra_args += 2;
2232 if (wc->max_writeback_jobs_set)
2233 extra_args += 2;
2234 if (wc->autocommit_blocks_set)
2235 extra_args += 2;
2236 if (wc->autocommit_time_set)
2237 extra_args += 2;
2238 if (wc->writeback_fua_set)
2239 extra_args++;
2240
2241 DMEMIT("%u", extra_args);
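		/*
		 * Convert the freelist thresholds back to the user-visible
		 * percentages (inverse of the calculation in writecache_ctr(),
		 * with the same rounding).
		 */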
2242 if (wc->high_wm_percent_set) {
2243 x = (uint64_t)wc->freelist_high_watermark * 100;
2244 x += wc->n_blocks / 2;
2245 do_div(x, (size_t)wc->n_blocks);
2246 DMEMIT(" high_watermark %u", 100 - (unsigned)x);
2247 }
2248 if (wc->low_wm_percent_set) {
2249 x = (uint64_t)wc->freelist_low_watermark * 100;
2250 x += wc->n_blocks / 2;
2251 do_div(x, (size_t)wc->n_blocks);
2252 DMEMIT(" low_watermark %u", 100 - (unsigned)x);
2253 }
2254 if (wc->max_writeback_jobs_set)
2255 DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
2256 if (wc->autocommit_blocks_set)
2257 DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
2258 if (wc->autocommit_time_set)
2259 DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies));
2260 if (wc->writeback_fua_set)
2261 DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
2262 break;
2263 }
2264}
2265
2266static struct target_type writecache_target = {
2267 .name = "writecache",
2268 .version = {1, 0, 0},
2269 .module = THIS_MODULE,
2270 .ctr = writecache_ctr,
2271 .dtr = writecache_dtr,
2272 .status = writecache_status,
2273 .postsuspend = writecache_suspend,
2274 .resume = writecache_resume,
2275 .message = writecache_message,
2276 .map = writecache_map,
2277 .end_io = writecache_end_io,
2278 .iterate_devices = writecache_iterate_devices,
2279 .io_hints = writecache_io_hints,
2280};
2281
2282static int __init dm_writecache_init(void)
2283{
2284 int r;
2285
2286 r = dm_register_target(&writecache_target);
2287 if (r < 0) {
2288 DMERR("register failed %d", r);
2289 return r;
2290 }
2291
2292 return 0;
2293}
2294
2295static void __exit dm_writecache_exit(void)
2296{
2297 dm_unregister_target(&writecache_target);
2298}
2299
2300module_init(dm_writecache_init);
2301module_exit(dm_writecache_exit);
2302
2303MODULE_DESCRIPTION(DM_NAME " writecache target");
2304MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
2305MODULE_LICENSE("GPL");
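
For reference, a construction line consistent with the argument parsing in
writecache_ctr() above (device names and the origin size in sectors are
placeholders; the optional-argument count is the total number of words, so a
parameter that takes a value counts as two):

   dmsetup create wc --table "0 <origin_sectors> writecache s <origin_dev> <cache_dev> 4096 4 high_watermark 60 writeback_jobs 1024"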