author		Josef Bacik <jbacik@fb.com>	2015-03-20 10:50:37 -0400
committer	Mike Snitzer <snitzer@redhat.com>	2015-04-15 12:10:24 -0400
commit		0e9cebe724597a76ab1b0ebc0a21e16f7db11b47
tree		8217c6cea86697a985a7f88ff4240bb93277db84
parent		7f61f5a022101e0c38c3cff2ef9ace9c9c86dbfb

dm: add log writes target
Introduce a new target that is meant for file system developers to test file
system integrity at particular points in the life of a file system. We capture
all write requests and associated data and log them to a separate device
for later replay. There is a userspace utility to do this replay. The
idea behind this is to give file system developers a tool to verify that
the file system is always consistent.
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Zach Brown <zab@zabbo.net>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
 Documentation/device-mapper/log-writes.txt | 140
 drivers/md/Kconfig                         |  16
 drivers/md/Makefile                        |   1
 drivers/md/dm-log-writes.c                 | 825
 4 files changed, 982 insertions(+), 0 deletions(-)
diff --git a/Documentation/device-mapper/log-writes.txt b/Documentation/device-mapper/log-writes.txt
new file mode 100644
index 000000000000..c10f30c9b534
--- /dev/null
+++ b/Documentation/device-mapper/log-writes.txt
@@ -0,0 +1,140 @@
dm-log-writes
=============

This target takes 2 devices, one to pass all IO to normally, and one to log all
of the write operations to. This is intended for file system developers wishing
to verify the integrity of metadata or data as the file system is written to.
There is a log_write_entry written for every WRITE request and the target is
able to take arbitrary data from userspace to insert into the log. The data
that is in the WRITE requests is copied into the log to make the replay happen
exactly as it happened originally.

Log Ordering
============

We log things in order of completion once we are sure the write is no longer in
cache. This means that normal WRITE requests are not actually logged until the
next REQ_FLUSH request. This is to make it easier for userspace to replay the
log in a way that correlates to what is on disk and not what is in cache,
making it easier to detect improper waiting/flushing.

This works by attaching all WRITE requests to a list once the write completes.
Once we see a REQ_FLUSH request we splice this list onto the request, and once
the FLUSH request completes we log all of the WRITEs and then the FLUSH. Only
WRITEs that have completed by the time the REQ_FLUSH is issued are added, in
order to simulate the worst case scenario with regard to power failures.
Consider the following example (W means write, C means complete):

W1,W2,W3,C3,C2,Wflush,C1,Cflush

The log would show the following:

W3,W2,flush,W1....

Again, this simulates what is actually on disk, which allows us to detect
cases where a power failure at a particular point in time would create an
inconsistent file system.

Any REQ_FUA requests bypass this flushing mechanism and are logged as soon as
they complete, as those requests will obviously bypass the device cache.

Any REQ_DISCARD requests are treated like WRITE requests. Otherwise we would
have all the DISCARD requests, then the WRITE requests, and then the FLUSH
request. Consider the following example:

WRITE block 1, DISCARD block 1, FLUSH

If we logged DISCARD when it completed, the replay would look like this:

DISCARD 1, WRITE 1, FLUSH

which isn't quite what happened and wouldn't be caught during the log replay.

Target interface
================

i) Constructor

   log-writes <dev_path> <log_dev_path>

   dev_path     : Device that all of the IO will go to normally.
   log_dev_path : Device where the log entries are written to.

ii) Status

   <#logged entries> <highest allocated sector>

   #logged entries          : Number of logged entries
   highest allocated sector : Highest allocated sector

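   As an illustration only (the numbers here are made up; the leading
   "0 41943040 log-writes" is the standard dmsetup start/length/target
   prefix), a status query might look like:

   # dmsetup status log
   0 41943040 log-writes 1004 104929
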
iii) Messages

   mark <description>

   You can use a dmsetup message to set an arbitrary mark in a log.
   For example, say you want to fsck a file system after every
   write, but first you need to replay up to the mkfs to make sure
   we're fsck'ing something reasonable. You would do something like
   this:

   mkfs.btrfs -f /dev/mapper/log
   dmsetup message log 0 mark mkfs
   <run test>

   This would allow you to replay the log up to the mkfs mark and
   then replay from that point on, doing the fsck check in the
   interval that you want.

   Every log has a mark at the end labeled "dm-log-writes-end".

Userspace component
===================

There is a userspace tool that will replay the log for you in various ways.
It can be found here: https://github.com/josefbacik/log-writes
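
The super block layout (struct log_write_super in drivers/md/dm-log-writes.c
below) is simple enough to inspect by hand. Here is a minimal, hypothetical C
sketch (not part of the replay tool) that reads and sanity-checks the super;
it assumes a little-endian host and the structure layout used by the target:

  #include <stdio.h>
  #include <stdint.h>
  #include <inttypes.h>

  #define WRITE_LOG_MAGIC 0x6a736677736872ULL

  /* Mirrors struct log_write_super; on-disk fields are little-endian. */
  struct log_write_super {
          uint64_t magic;
          uint64_t version;
          uint64_t nr_entries;
          uint32_t sectorsize;
  };

  int main(int argc, char **argv)
  {
          struct log_write_super super;
          FILE *f;

          if (argc != 2) {
                  fprintf(stderr, "usage: %s <log_dev>\n", argv[0]);
                  return 1;
          }
          f = fopen(argv[1], "rb");
          if (!f || fread(&super, sizeof(super), 1, f) != 1) {
                  perror("read super");
                  return 1;
          }
          /* Assumes a little-endian host; a portable tool would use le64toh(). */
          if (super.magic != WRITE_LOG_MAGIC) {
                  fprintf(stderr, "bad magic, not a dm-log-writes log\n");
                  return 1;
          }
          printf("version %" PRIu64 ", %" PRIu64 " entries, sectorsize %" PRIu32 "\n",
                 super.version, super.nr_entries, super.sectorsize);
          fclose(f);
          return 0;
  }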

Example usage
=============

Say you want to test fsync on your file system. You would do something like
this:

TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc"
dmsetup create log --table "$TABLE"
mkfs.btrfs -f /dev/mapper/log
dmsetup message log 0 mark mkfs

mount /dev/mapper/log /mnt/btrfs-test
<some test that does fsync at the end>
dmsetup message log 0 mark fsync
md5sum /mnt/btrfs-test/foo
umount /mnt/btrfs-test

dmsetup remove log
replay-log --log /dev/sdc --replay /dev/sdb --end-mark fsync
mount /dev/sdb /mnt/btrfs-test
md5sum /mnt/btrfs-test/foo
<verify md5sum's are correct>

Another option is to do a complicated file system operation and verify the file
system is consistent during the entire operation. You could do this with:

TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc"
dmsetup create log --table "$TABLE"
mkfs.btrfs -f /dev/mapper/log
dmsetup message log 0 mark mkfs

mount /dev/mapper/log /mnt/btrfs-test
<fsstress to dirty the fs>
btrfs filesystem balance /mnt/btrfs-test
umount /mnt/btrfs-test
dmsetup remove log

replay-log --log /dev/sdc --replay /dev/sdb --end-mark mkfs
btrfsck /dev/sdb
replay-log --log /dev/sdc --replay /dev/sdb --start-mark mkfs \
	--fsck "btrfsck /dev/sdb" --check fua

That will replay the log until it sees a FUA request, run the fsck command,
and if the fsck passes, replay to the next FUA, repeating until the log is
fully replayed or the fsck command exits abnormally.
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 109f9dcc9cab..6ddc983417d5 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -443,4 +443,20 @@ config DM_SWITCH
	  If unsure, say N.

config DM_LOG_WRITES
	tristate "Log writes target support"
	depends on BLK_DEV_DM
	---help---
	  This device-mapper target takes two devices, one device to use
	  normally, one to log all write operations done to the first device.
	  This is for use by file system developers wishing to verify that
	  their fs is writing a consistent file system at all times by allowing
	  them to replay the log in a variety of ways and to check the
	  contents.

	  To compile this code as a module, choose M here: the module will
	  be called dm-log-writes.

	  If unsure, say N.

endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index a2da532b1c2b..1863feaa5846 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_DM_CACHE) += dm-cache.o
obj-$(CONFIG_DM_CACHE_MQ)	+= dm-cache-mq.o
obj-$(CONFIG_DM_CACHE_CLEANER)	+= dm-cache-cleaner.o
obj-$(CONFIG_DM_ERA)		+= dm-era.o
obj-$(CONFIG_DM_LOG_WRITES)	+= dm-log-writes.o

ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs			+= dm-uevent.o
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
new file mode 100644
index 000000000000..93e08446a87d
--- /dev/null
+++ b/drivers/md/dm-log-writes.c
@@ -0,0 +1,825 @@
/*
 * Copyright (C) 2014 Facebook. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/device-mapper.h>

#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/freezer.h>

#define DM_MSG_PREFIX "log-writes"

/*
 * This target will sequentially log all writes to the target device onto the
 * log device. This is helpful for replaying writes to check for fs consistency
 * at all times. This target provides a mechanism to mark specific events to
 * check data at a later time. So for example you would:
 *
 * write data
 * fsync
 * dmsetup message /dev/whatever mark mymark
 * unmount /mnt/test
 *
 * Then replay the log up to mymark and check the contents of the replay to
 * verify it matches what was written.
 *
 * We log writes only after they have been flushed; this makes the log describe
 * close to the order in which the data hits the actual disk, not its cache. So
 * for example the following sequence (W means write, C means complete)
 *
 * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd
 *
 * would result in the log looking like this:
 *
 * c,a,flush,fuad,b,<other writes>,<next flush>
 *
 * This is meant to help expose problems where file systems do not properly wait
 * on data being written before invoking a FLUSH. FUA bypasses cache so once it
 * completes it is added to the log as it should be on disk.
 *
 * We treat DISCARDs as if they don't bypass cache so that they are logged in
 * order of completion along with the normal writes. If we didn't do it this
 * way we would process all the discards first and then write all the data, when
 * in fact we want to do the data and the discard in the order that they
 * completed.
 */
#define LOG_FLUSH_FLAG (1 << 0)
#define LOG_FUA_FLAG (1 << 1)
#define LOG_DISCARD_FLAG (1 << 2)
#define LOG_MARK_FLAG (1 << 3)

#define WRITE_LOG_VERSION 1
#define WRITE_LOG_MAGIC 0x6a736677736872

/*
 * The disk format for this is braindead simple.
 *
 * At byte 0 we have our super, followed by the following sequence for
 * nr_entries:
 *
 * [   1 sector    ][ entry->nr_sectors ]
 * [log_write_entry][   data written    ]
 *
 * The log_write_entry takes up a full sector so we can have arbitrary length
 * marks and it leaves us room for extra content in the future.
 */

/*
 * Basic info about the log for userspace.
 */
struct log_write_super {
	__le64 magic;
	__le64 version;
	__le64 nr_entries;
	__le32 sectorsize;
};

/*
 * sector - the sector we wrote.
 * nr_sectors - the number of sectors we wrote.
 * flags - flags for this log entry.
 * data_len - the size of the data in this log entry, this is for private log
 * entry stuff, the MARK data provided by userspace for example.
 */
struct log_write_entry {
	__le64 sector;
	__le64 nr_sectors;
	__le64 flags;
	__le64 data_len;
};

struct log_writes_c {
	struct dm_dev *dev;
	struct dm_dev *logdev;
	u64 logged_entries;
	u32 sectorsize;
	atomic_t io_blocks;
	atomic_t pending_blocks;
	sector_t next_sector;
	sector_t end_sector;
	bool logging_enabled;
	bool device_supports_discard;
	spinlock_t blocks_lock;
	struct list_head unflushed_blocks;
	struct list_head logging_blocks;
	wait_queue_head_t wait;
	struct task_struct *log_kthread;
};

struct pending_block {
	int vec_cnt;
	u64 flags;
	sector_t sector;
	sector_t nr_sectors;
	char *data;
	u32 datalen;
	struct list_head list;
	struct bio_vec vecs[0];
};

struct per_bio_data {
	struct pending_block *block;
};

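/*
 * Drop a reference on the count of blocks waiting to be logged and wake
 * anyone waiting for the count to reach zero (the destructor does).
 */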
static void put_pending_block(struct log_writes_c *lc)
{
	if (atomic_dec_and_test(&lc->pending_blocks)) {
		smp_mb__after_atomic();
		if (waitqueue_active(&lc->wait))
			wake_up(&lc->wait);
	}
}

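/*
 * Same as above, but for bios in flight to the log device rather than
 * blocks queued for logging.
 */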
static void put_io_block(struct log_writes_c *lc)
{
	if (atomic_dec_and_test(&lc->io_blocks)) {
		smp_mb__after_atomic();
		if (waitqueue_active(&lc->wait))
			wake_up(&lc->wait);
	}
}

static void log_end_io(struct bio *bio, int err)
{
	struct log_writes_c *lc = bio->bi_private;
	struct bio_vec *bvec;
	int i;

	if (err) {
		unsigned long flags;

		DMERR("Error writing log block, error=%d", err);
		spin_lock_irqsave(&lc->blocks_lock, flags);
		lc->logging_enabled = false;
		spin_unlock_irqrestore(&lc->blocks_lock, flags);
	}

	bio_for_each_segment_all(bvec, bio, i)
		__free_page(bvec->bv_page);

	put_io_block(lc);
	bio_put(bio);
}

/*
 * Meant to be called if there is an error, it will free all the pages
 * associated with the block.
 */
static void free_pending_block(struct log_writes_c *lc,
			       struct pending_block *block)
{
	int i;

	for (i = 0; i < block->vec_cnt; i++) {
		if (block->vecs[i].bv_page)
			__free_page(block->vecs[i].bv_page);
	}
	kfree(block->data);
	kfree(block);
	put_pending_block(lc);
}

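/*
 * Write a single metadata sector to the log device: the entry (or the
 * super), followed by any inline data, zero padded out to sectorsize.
 */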
static int write_metadata(struct log_writes_c *lc, void *entry,
			  size_t entrylen, void *data, size_t datalen,
			  sector_t sector)
{
	struct bio *bio;
	struct page *page;
	void *ptr;
	size_t ret;

	bio = bio_alloc(GFP_KERNEL, 1);
	if (!bio) {
		DMERR("Couldn't alloc log bio");
		goto error;
	}
	bio->bi_iter.bi_size = 0;
	bio->bi_iter.bi_sector = sector;
	bio->bi_bdev = lc->logdev->bdev;
	bio->bi_end_io = log_end_io;
	bio->bi_private = lc;
	set_bit(BIO_UPTODATE, &bio->bi_flags);

	page = alloc_page(GFP_KERNEL);
	if (!page) {
		DMERR("Couldn't alloc log page");
		bio_put(bio);
		goto error;
	}

	ptr = kmap_atomic(page);
	memcpy(ptr, entry, entrylen);
	if (datalen)
		memcpy(ptr + entrylen, data, datalen);
	memset(ptr + entrylen + datalen, 0,
	       lc->sectorsize - entrylen - datalen);
	kunmap_atomic(ptr);

	ret = bio_add_page(bio, page, lc->sectorsize, 0);
	if (ret != lc->sectorsize) {
		DMERR("Couldn't add page to the log block");
		goto error_bio;
	}
	submit_bio(WRITE, bio);
	return 0;
error_bio:
	bio_put(bio);
	__free_page(page);
error:
	put_io_block(lc);
	return -1;
}

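/*
 * Log one pending block: a one sector log_write_entry followed by the
 * data pages that were copied out of the original bio at map time.
 */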
static int log_one_block(struct log_writes_c *lc,
			 struct pending_block *block, sector_t sector)
{
	struct bio *bio;
	struct log_write_entry entry;
	size_t ret;
	int i;

	entry.sector = cpu_to_le64(block->sector);
	entry.nr_sectors = cpu_to_le64(block->nr_sectors);
	entry.flags = cpu_to_le64(block->flags);
	entry.data_len = cpu_to_le64(block->datalen);
	if (write_metadata(lc, &entry, sizeof(entry), block->data,
			   block->datalen, sector)) {
		free_pending_block(lc, block);
		return -1;
	}

	if (!block->vec_cnt)
		goto out;
	sector++;

	bio = bio_alloc(GFP_KERNEL, block->vec_cnt);
	if (!bio) {
		DMERR("Couldn't alloc log bio");
		goto error;
	}
	atomic_inc(&lc->io_blocks);
	bio->bi_iter.bi_size = 0;
	bio->bi_iter.bi_sector = sector;
	bio->bi_bdev = lc->logdev->bdev;
	bio->bi_end_io = log_end_io;
	bio->bi_private = lc;
	set_bit(BIO_UPTODATE, &bio->bi_flags);

	for (i = 0; i < block->vec_cnt; i++) {
		/*
		 * The page offset is always 0 because we allocate a new page
		 * for every bvec in the original bio for simplicity's sake.
		 */
		ret = bio_add_page(bio, block->vecs[i].bv_page,
				   block->vecs[i].bv_len, 0);
		if (ret != block->vecs[i].bv_len) {
			atomic_inc(&lc->io_blocks);
			submit_bio(WRITE, bio);
			bio = bio_alloc(GFP_KERNEL, block->vec_cnt - i);
			if (!bio) {
				DMERR("Couldn't alloc log bio");
				goto error;
			}
			bio->bi_iter.bi_size = 0;
			bio->bi_iter.bi_sector = sector;
			bio->bi_bdev = lc->logdev->bdev;
			bio->bi_end_io = log_end_io;
			bio->bi_private = lc;
			set_bit(BIO_UPTODATE, &bio->bi_flags);

			ret = bio_add_page(bio, block->vecs[i].bv_page,
					   block->vecs[i].bv_len, 0);
			if (ret != block->vecs[i].bv_len) {
				DMERR("Couldn't add page on new bio?");
				bio_put(bio);
				goto error;
			}
		}
		sector += block->vecs[i].bv_len >> SECTOR_SHIFT;
	}
	submit_bio(WRITE, bio);
out:
	kfree(block->data);
	kfree(block);
	put_pending_block(lc);
	return 0;
error:
	free_pending_block(lc, block);
	put_io_block(lc);
	return -1;
}

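/* Rewrite the super at sector 0 so it reflects the current entry count. */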
static int log_super(struct log_writes_c *lc)
{
	struct log_write_super super;

	super.magic = cpu_to_le64(WRITE_LOG_MAGIC);
	super.version = cpu_to_le64(WRITE_LOG_VERSION);
	super.nr_entries = cpu_to_le64(lc->logged_entries);
	super.sectorsize = cpu_to_le32(lc->sectorsize);

	if (write_metadata(lc, &super, sizeof(super), NULL, 0, 0)) {
		DMERR("Couldn't write super");
		return -1;
	}

	return 0;
}

static inline sector_t logdev_last_sector(struct log_writes_c *lc)
{
	return i_size_read(lc->logdev->bdev->bd_inode) >> SECTOR_SHIFT;
}

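/*
 * Background thread: pop blocks off logging_blocks, allocate log sectors
 * for them and write them out, disabling logging on any error or once
 * the log device fills up.
 */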
static int log_writes_kthread(void *arg)
{
	struct log_writes_c *lc = (struct log_writes_c *)arg;
	sector_t sector = 0;

	while (!kthread_should_stop()) {
		bool super = false;
		bool logging_enabled;
		struct pending_block *block = NULL;
		int ret;

		spin_lock_irq(&lc->blocks_lock);
		if (!list_empty(&lc->logging_blocks)) {
			block = list_first_entry(&lc->logging_blocks,
						 struct pending_block, list);
			list_del_init(&block->list);
			if (!lc->logging_enabled)
				goto next;

			sector = lc->next_sector;
			if (block->flags & LOG_DISCARD_FLAG)
				lc->next_sector++;
			else
				lc->next_sector += block->nr_sectors + 1;

			/*
			 * Apparently the size of the device may not be known
			 * right away, so handle this properly.
			 */
			if (!lc->end_sector)
				lc->end_sector = logdev_last_sector(lc);
			if (lc->end_sector &&
			    lc->next_sector >= lc->end_sector) {
				DMERR("Ran out of space on the logdev");
				lc->logging_enabled = false;
				goto next;
			}
			lc->logged_entries++;
			atomic_inc(&lc->io_blocks);

			super = (block->flags & (LOG_FUA_FLAG | LOG_MARK_FLAG));
			if (super)
				atomic_inc(&lc->io_blocks);
		}
next:
		logging_enabled = lc->logging_enabled;
		spin_unlock_irq(&lc->blocks_lock);
		if (block) {
			if (logging_enabled) {
				ret = log_one_block(lc, block, sector);
				if (!ret && super)
					ret = log_super(lc);
				if (ret) {
					spin_lock_irq(&lc->blocks_lock);
					lc->logging_enabled = false;
					spin_unlock_irq(&lc->blocks_lock);
				}
			} else
				free_pending_block(lc, block);
			continue;
		}

		if (!try_to_freeze()) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (!kthread_should_stop() &&
			    !atomic_read(&lc->pending_blocks))
				schedule();
			__set_current_state(TASK_RUNNING);
		}
	}
	return 0;
}

/*
 * Construct a log-writes mapping:
 * log-writes <dev_path> <log_dev_path>
 */
static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct log_writes_c *lc;
	struct dm_arg_set as;
	const char *devname, *logdevname;

	as.argc = argc;
	as.argv = argv;

	if (argc < 2) {
		ti->error = "Invalid argument count";
		return -EINVAL;
	}

	lc = kzalloc(sizeof(struct log_writes_c), GFP_KERNEL);
	if (!lc) {
		ti->error = "Cannot allocate context";
		return -ENOMEM;
	}
	spin_lock_init(&lc->blocks_lock);
	INIT_LIST_HEAD(&lc->unflushed_blocks);
	INIT_LIST_HEAD(&lc->logging_blocks);
	init_waitqueue_head(&lc->wait);
	lc->sectorsize = 1 << SECTOR_SHIFT;
	atomic_set(&lc->io_blocks, 0);
	atomic_set(&lc->pending_blocks, 0);

	devname = dm_shift_arg(&as);
	if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev)) {
		ti->error = "Device lookup failed";
		goto bad;
	}

	logdevname = dm_shift_arg(&as);
	if (dm_get_device(ti, logdevname, dm_table_get_mode(ti->table), &lc->logdev)) {
		ti->error = "Log device lookup failed";
		dm_put_device(ti, lc->dev);
		goto bad;
	}

	lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write");
	if (!lc->log_kthread) {
		ti->error = "Couldn't alloc kthread";
		dm_put_device(ti, lc->dev);
		dm_put_device(ti, lc->logdev);
		goto bad;
	}

	/* We put the super at sector 0, start logging at sector 1 */
	lc->next_sector = 1;
	lc->logging_enabled = true;
	lc->end_sector = logdev_last_sector(lc);
	lc->device_supports_discard = true;

	ti->num_flush_bios = 1;
	ti->flush_supported = true;
	ti->num_discard_bios = 1;
	ti->discards_supported = true;
	ti->per_bio_data_size = sizeof(struct per_bio_data);
	ti->private = lc;
	return 0;

bad:
	kfree(lc);
	return -EINVAL;
}

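/*
 * Queue a userspace supplied mark string as a log entry; the string is
 * truncated to what fits in one sector after the log_write_entry.
 */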
static int log_mark(struct log_writes_c *lc, char *data)
{
	struct pending_block *block;
	size_t maxsize = lc->sectorsize - sizeof(struct log_write_entry);

	block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
	if (!block) {
		DMERR("Error allocating pending block");
		return -ENOMEM;
	}

	block->data = kstrndup(data, maxsize, GFP_KERNEL);
	if (!block->data) {
		DMERR("Error copying mark data");
		kfree(block);
		return -ENOMEM;
	}
	atomic_inc(&lc->pending_blocks);
	block->datalen = strlen(block->data);
	block->flags |= LOG_MARK_FLAG;
	spin_lock_irq(&lc->blocks_lock);
	list_add_tail(&block->list, &lc->logging_blocks);
	spin_unlock_irq(&lc->blocks_lock);
	wake_up_process(lc->log_kthread);
	return 0;
}

static void log_writes_dtr(struct dm_target *ti)
{
	struct log_writes_c *lc = ti->private;

	spin_lock_irq(&lc->blocks_lock);
	list_splice_init(&lc->unflushed_blocks, &lc->logging_blocks);
	spin_unlock_irq(&lc->blocks_lock);

	/*
	 * This is just nice to have since it'll update the super to include
	 * the unflushed blocks, if it fails we don't really care.
	 */
	log_mark(lc, "dm-log-writes-end");
	wake_up_process(lc->log_kthread);
	wait_event(lc->wait, !atomic_read(&lc->io_blocks) &&
			     !atomic_read(&lc->pending_blocks));
	kthread_stop(lc->log_kthread);

	WARN_ON(!list_empty(&lc->logging_blocks));
	WARN_ON(!list_empty(&lc->unflushed_blocks));
	dm_put_device(ti, lc->dev);
	dm_put_device(ti, lc->logdev);
	kfree(lc);
}

static void normal_map_bio(struct dm_target *ti, struct bio *bio)
{
	struct log_writes_c *lc = ti->private;

	bio->bi_bdev = lc->dev->bdev;
}

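/*
 * Map incoming IO to the real device; for writes, copy the bio's data
 * into a pending_block so it can be logged once it is safe on disk.
 */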
static int log_writes_map(struct dm_target *ti, struct bio *bio)
{
	struct log_writes_c *lc = ti->private;
	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
	struct pending_block *block;
	struct bvec_iter iter;
	struct bio_vec bv;
	size_t alloc_size;
	int i = 0;
	bool flush_bio = (bio->bi_rw & REQ_FLUSH);
	bool fua_bio = (bio->bi_rw & REQ_FUA);
	bool discard_bio = (bio->bi_rw & REQ_DISCARD);

	pb->block = NULL;

	/* Don't bother doing anything if logging has been disabled */
	if (!lc->logging_enabled)
		goto map_bio;

	/*
	 * Map reads as normal.
	 */
	if (bio_data_dir(bio) == READ)
		goto map_bio;

	/* No sectors and not a flush? Don't care */
	if (!bio_sectors(bio) && !flush_bio)
		goto map_bio;

	/*
	 * Discards will have bi_size set but there's no actual data, so just
	 * allocate the size of the pending block.
	 */
	if (discard_bio)
		alloc_size = sizeof(struct pending_block);
	else
		alloc_size = sizeof(struct pending_block) + sizeof(struct bio_vec) * bio_segments(bio);

	block = kzalloc(alloc_size, GFP_NOIO);
	if (!block) {
		DMERR("Error allocating pending block");
		spin_lock_irq(&lc->blocks_lock);
		lc->logging_enabled = false;
		spin_unlock_irq(&lc->blocks_lock);
		return -ENOMEM;
	}
	INIT_LIST_HEAD(&block->list);
	pb->block = block;
	atomic_inc(&lc->pending_blocks);

	if (flush_bio)
		block->flags |= LOG_FLUSH_FLAG;
	if (fua_bio)
		block->flags |= LOG_FUA_FLAG;
	if (discard_bio)
		block->flags |= LOG_DISCARD_FLAG;

	block->sector = bio->bi_iter.bi_sector;
	block->nr_sectors = bio_sectors(bio);

	/* We don't need the data, just submit */
	if (discard_bio) {
		WARN_ON(flush_bio || fua_bio);
		if (lc->device_supports_discard)
			goto map_bio;
		bio_endio(bio, 0);
		return DM_MAPIO_SUBMITTED;
	}

	/* Flush bio, splice the unflushed blocks onto this list and submit */
	if (flush_bio && !bio_sectors(bio)) {
		spin_lock_irq(&lc->blocks_lock);
		list_splice_init(&lc->unflushed_blocks, &block->list);
		spin_unlock_irq(&lc->blocks_lock);
		goto map_bio;
	}

	/*
	 * We will write this bio somewhere else way later so we need to copy
	 * the actual contents into new pages so we know the data will always be
	 * there.
	 *
	 * We do this because this could be a bio from O_DIRECT in which case we
	 * can't just hold onto the page until some later point, we have to
	 * manually copy the contents.
	 */
	bio_for_each_segment(bv, bio, iter) {
		struct page *page;
		void *src, *dst;

		page = alloc_page(GFP_NOIO);
		if (!page) {
			DMERR("Error allocing page");
			free_pending_block(lc, block);
			spin_lock_irq(&lc->blocks_lock);
			lc->logging_enabled = false;
			spin_unlock_irq(&lc->blocks_lock);
			return -ENOMEM;
		}

		src = kmap_atomic(bv.bv_page);
		dst = kmap_atomic(page);
		memcpy(dst, src + bv.bv_offset, bv.bv_len);
		kunmap_atomic(dst);
		kunmap_atomic(src);
		block->vecs[i].bv_page = page;
		block->vecs[i].bv_len = bv.bv_len;
		block->vec_cnt++;
		i++;
	}

	/* Had a flush with data in it, weird */
	if (flush_bio) {
		spin_lock_irq(&lc->blocks_lock);
		list_splice_init(&lc->unflushed_blocks, &block->list);
		spin_unlock_irq(&lc->blocks_lock);
	}
map_bio:
	normal_map_bio(ti, bio);
	return DM_MAPIO_REMAPPED;
}

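/*
 * On write completion move the pending block to the unflushed list, or,
 * for flush/FUA bios, straight to the logging list and wake the kthread.
 */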
static int normal_end_io(struct dm_target *ti, struct bio *bio, int error)
{
	struct log_writes_c *lc = ti->private;
	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));

	if (bio_data_dir(bio) == WRITE && pb->block) {
		struct pending_block *block = pb->block;
		unsigned long flags;

		spin_lock_irqsave(&lc->blocks_lock, flags);
		if (block->flags & LOG_FLUSH_FLAG) {
			list_splice_tail_init(&block->list, &lc->logging_blocks);
			list_add_tail(&block->list, &lc->logging_blocks);
			wake_up_process(lc->log_kthread);
		} else if (block->flags & LOG_FUA_FLAG) {
			list_add_tail(&block->list, &lc->logging_blocks);
			wake_up_process(lc->log_kthread);
		} else
			list_add_tail(&block->list, &lc->unflushed_blocks);
		spin_unlock_irqrestore(&lc->blocks_lock, flags);
	}

	return error;
}

/*
 * INFO format: <logged entries> <highest allocated sector>
 */
static void log_writes_status(struct dm_target *ti, status_type_t type,
			      unsigned status_flags, char *result,
			      unsigned maxlen)
{
	unsigned sz = 0;
	struct log_writes_c *lc = ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%llu %llu", lc->logged_entries,
		       (unsigned long long)lc->next_sector - 1);
		if (!lc->logging_enabled)
			DMEMIT(" logging_disabled");
		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%s %s", lc->dev->name, lc->logdev->name);
		break;
	}
}

static int log_writes_ioctl(struct dm_target *ti, unsigned int cmd,
			    unsigned long arg)
{
	struct log_writes_c *lc = ti->private;
	struct dm_dev *dev = lc->dev;
	int r = 0;

	/*
	 * Only pass ioctls through if the device sizes match exactly.
	 */
	if (ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
		r = scsi_verify_blk_ioctl(NULL, cmd);

	return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
}

static int log_writes_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
			    struct bio_vec *biovec, int max_size)
{
	struct log_writes_c *lc = ti->private;
	struct request_queue *q = bdev_get_queue(lc->dev->bdev);

	if (!q->merge_bvec_fn)
		return max_size;

	bvm->bi_bdev = lc->dev->bdev;
	bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);

	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}

static int log_writes_iterate_devices(struct dm_target *ti,
				      iterate_devices_callout_fn fn,
				      void *data)
{
	struct log_writes_c *lc = ti->private;

	return fn(ti, lc->dev, 0, ti->len, data);
}

/*
 * Messages supported:
 *   mark <mark data> - specify the marked data.
 */
static int log_writes_message(struct dm_target *ti, unsigned argc, char **argv)
{
	int r = -EINVAL;
	struct log_writes_c *lc = ti->private;

	if (argc != 2) {
		DMWARN("Invalid log-writes message arguments, expect 2 arguments, got %d", argc);
		return r;
	}

	if (!strcasecmp(argv[0], "mark"))
		r = log_mark(lc, argv[1]);
	else
		DMWARN("Unrecognised log writes target message received: %s", argv[0]);

	return r;
}

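/*
 * If the underlying device cannot discard, advertise discard support
 * anyway so the bios still reach log_writes_map, which completes them
 * itself instead of passing them down.
 */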
static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct log_writes_c *lc = ti->private;
	struct request_queue *q = bdev_get_queue(lc->dev->bdev);

	if (!q || !blk_queue_discard(q)) {
		lc->device_supports_discard = false;
		limits->discard_granularity = 1 << SECTOR_SHIFT;
		limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT);
	}
}

static struct target_type log_writes_target = {
	.name   = "log-writes",
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.ctr    = log_writes_ctr,
	.dtr    = log_writes_dtr,
	.map    = log_writes_map,
	.end_io = normal_end_io,
	.status = log_writes_status,
	.ioctl  = log_writes_ioctl,
	.merge  = log_writes_merge,
	.message = log_writes_message,
	.iterate_devices = log_writes_iterate_devices,
	.io_hints = log_writes_io_hints,
};

static int __init dm_log_writes_init(void)
{
	int r = dm_register_target(&log_writes_target);

	if (r < 0)
		DMERR("register failed %d", r);

	return r;
}

static void __exit dm_log_writes_exit(void)
{
	dm_unregister_target(&log_writes_target);
}

module_init(dm_log_writes_init);
module_exit(dm_log_writes_exit);

MODULE_DESCRIPTION(DM_NAME " log writes target");
MODULE_AUTHOR("Josef Bacik <jbacik@fb.com>");
MODULE_LICENSE("GPL");