aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJoe Thornber <thornber@redhat.com>2011-10-31 16:21:18 -0400
committerAlasdair G Kergon <agk@redhat.com>2011-10-31 16:21:18 -0400
commit991d9fa02da0dd1f843dc011376965e0c8c6c9b5 (patch)
treea64c94710246b77bb74cd77634581cea3d32cfe1
parent3241b1d3e0aaafbfcd320f4d71ade629728cc4f4 (diff)
dm: add thin provisioning target
Initial EXPERIMENTAL implementation of device-mapper thin provisioning with snapshot support. The 'thin' target is used to create instances of the virtual devices that are hosted in the 'thin-pool' target. The thin-pool target provides data sharing among devices. This sharing is made possible using the persistent-data library in the previous patch. The main highlight of this implementation, compared to the previous implementation of snapshots, is that it allows many virtual devices to be stored on the same data volume, simplifying administration and allowing sharing of data between volumes (thus reducing disk usage). Another big feature is support for arbitrary depth of recursive snapshots (snapshots of snapshots of snapshots ...). The previous implementation of snapshots did this by chaining together lookup tables, and so performance was O(depth). This new implementation uses a single data structure so we don't get this degradation with depth. For further information and examples of how to use this, please read Documentation/device-mapper/thin-provisioning.txt Signed-off-by: Joe Thornber <thornber@redhat.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
-rw-r--r--Documentation/device-mapper/thin-provisioning.txt285
-rw-r--r--drivers/md/Kconfig28
-rw-r--r--drivers/md/Makefile3
-rw-r--r--drivers/md/dm-thin-metadata.c1391
-rw-r--r--drivers/md/dm-thin-metadata.h156
-rw-r--r--drivers/md/dm-thin.c2428
6 files changed, 4291 insertions, 0 deletions
diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt
new file mode 100644
index 00000000000..801d9d1cf82
--- /dev/null
+++ b/Documentation/device-mapper/thin-provisioning.txt
@@ -0,0 +1,285 @@
1Introduction
2============
3
4This document describes a collection of device-mapper targets that
5between them implement thin-provisioning and snapshots.
6
7The main highlight of this implementation, compared to the previous
8implementation of snapshots, is that it allows many virtual devices to
9be stored on the same data volume. This simplifies administration and
10allows the sharing of data between volumes, thus reducing disk usage.
11
12Another significant feature is support for an arbitrary depth of
13recursive snapshots (snapshots of snapshots of snapshots ...). The
14previous implementation of snapshots did this by chaining together
15lookup tables, and so performance was O(depth). This new
16implementation uses a single data structure to avoid this degradation
17with depth. Fragmentation may still be an issue, however, in some
18scenarios.
19
20Metadata is stored on a separate device from data, giving the
21administrator some freedom, for example to:
22
23- Improve metadata resilience by storing metadata on a mirrored volume
24 but data on a non-mirrored one.
25
26- Improve performance by storing the metadata on SSD.
27
28Status
29======
30
31These targets are very much still in the EXPERIMENTAL state. Please
32do not yet rely on them in production. But do experiment and offer us
33feedback. Different use cases will have different performance
34characteristics, for example due to fragmentation of the data volume.
35
36If you find this software is not performing as expected please mail
37dm-devel@redhat.com with details and we'll try our best to improve
38things for you.
39
40Userspace tools for checking and repairing the metadata are under
41development.
42
43Cookbook
44========
45
46This section describes some quick recipes for using thin provisioning.
47They use the dmsetup program to control the device-mapper driver
48directly. End users will be advised to use a higher-level volume
49manager such as LVM2 once support has been added.
50
51Pool device
52-----------
53
54The pool device ties together the metadata volume and the data volume.
55It maps I/O linearly to the data volume and updates the metadata via
56two mechanisms:
57
58- Function calls from the thin targets
59
60- Device-mapper 'messages' from userspace which control the creation of new
61 virtual devices amongst other things.
62
63Setting up a fresh pool device
64------------------------------
65
66Setting up a pool device requires a valid metadata device, and a
67data device. If you do not have an existing metadata device you can
68make one by zeroing the first 4k to indicate empty metadata.
69
70 dd if=/dev/zero of=$metadata_dev bs=4096 count=1
71
72The amount of metadata you need will vary according to how many blocks
73are shared between thin devices (i.e. through snapshots). If you have
74less sharing than average you'll need a larger-than-average metadata device.
75
76As a guide, we suggest you calculate the number of bytes to use in the
77metadata device as 48 * $data_dev_size / $data_block_size but round it up
78to 2MB if the answer is smaller. The largest size supported is 16GB.
79
80If you're creating large numbers of snapshots which are recording large
81amounts of change, you may find you need to increase this.
82
83Reloading a pool table
84----------------------
85
86You may reload a pool's table, indeed this is how the pool is resized
87if it runs out of space. (N.B. While specifying a different metadata
88device when reloading is not forbidden at the moment, things will go
89wrong if it does not route I/O to exactly the same on-disk location as
90previously.)
91
92Using an existing pool device
93-----------------------------
94
95 dmsetup create pool \
96 --table "0 20971520 thin-pool $metadata_dev $data_dev \
97 $data_block_size $low_water_mark"
98
99$data_block_size gives the smallest unit of disk space that can be
100allocated at a time expressed in units of 512-byte sectors. People
101primarily interested in thin provisioning may want to use a value such
102as 1024 (512KB). People doing lots of snapshotting may want a smaller value
103such as 128 (64KB). If you are not zeroing newly-allocated data,
104a larger $data_block_size in the region of 256000 (128MB) is suggested.
105$data_block_size must be the same for the lifetime of the
106metadata device.
107
108$low_water_mark is expressed in blocks of size $data_block_size. If
109free space on the data device drops below this level then a dm event
110will be triggered which a userspace daemon should catch allowing it to
111extend the pool device. Only one such event will be sent.
112Resuming a device with a new table itself triggers an event so the
113userspace daemon can use this to detect a situation where a new table
114already exceeds the threshold.
115
116Thin provisioning
117-----------------
118
119i) Creating a new thinly-provisioned volume.
120
121 To create a new thinly-provisioned volume you must send a message to an
122 active pool device, /dev/mapper/pool in this example.
123
124 dmsetup message /dev/mapper/pool 0 "create_thin 0"
125
126 Here '0' is an identifier for the volume, a 24-bit number. It's up
127 to the caller to allocate and manage these identifiers. If the
128 identifier is already in use, the message will fail with -EEXIST.
129
130ii) Using a thinly-provisioned volume.
131
132 Thinly-provisioned volumes are activated using the 'thin' target:
133
134 dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0"
135
136 The last parameter is the identifier for the thinp device.
137
138Internal snapshots
139------------------
140
141i) Creating an internal snapshot.
142
143 Snapshots are created with another message to the pool.
144
145 N.B. If the origin device that you wish to snapshot is active, you
146 must suspend it before creating the snapshot to avoid corruption.
147 This is NOT enforced at the moment, so please be careful!
148
149 dmsetup suspend /dev/mapper/thin
150 dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
151 dmsetup resume /dev/mapper/thin
152
153 Here '1' is the identifier for the volume, a 24-bit number. '0' is the
154 identifier for the origin device.
155
156ii) Using an internal snapshot.
157
158 Once created, the user doesn't have to worry about any connection
159 between the origin and the snapshot. Indeed the snapshot is no
160 different from any other thinly-provisioned device and can be
161 snapshotted itself via the same method. It's perfectly legal to
162 have only one of them active, and there's no ordering requirement on
163 activating or removing them both. (This differs from conventional
164 device-mapper snapshots.)
165
166 Activate it exactly the same way as any other thinly-provisioned volume:
167
168 dmsetup create snap --table "0 2097152 thin /dev/mapper/pool 1"
169
170Deactivation
171------------
172
173All devices using a pool must be deactivated before the pool itself
174can be.
175
176 dmsetup remove thin
177 dmsetup remove snap
178 dmsetup remove pool
179
180Reference
181=========
182
183'thin-pool' target
184------------------
185
186i) Constructor
187
188 thin-pool <metadata dev> <data dev> <data block size (sectors)> \
189 <low water mark (blocks)> [<number of feature args> [<arg>]*]
190
191 Optional feature arguments:
192 - 'skip_block_zeroing': skips the zeroing of newly-provisioned blocks.
193
194 Data block size must be between 64KB (128 sectors) and 1GB
195 (2097152 sectors) inclusive.
196
197
198ii) Status
199
200 <transaction id> <used metadata blocks>/<total metadata blocks>
201 <used data blocks>/<total data blocks> <held metadata root>
202
203
204 transaction id:
205 A 64-bit number used by userspace to help synchronise with metadata
206 from volume managers.
207
208 used data blocks / total data blocks
209 If the number of free blocks drops below the pool's low water mark a
210 dm event will be sent to userspace. This event is edge-triggered and
211 it will occur only once after each resume so volume manager writers
212 should register for the event and then check the target's status.
213
214 held metadata root:
215 The location, in sectors, of the metadata root that has been
216 'held' for userspace read access. '-' indicates there is no
217 held root. This feature is not yet implemented so '-' is
218 always returned.
219
220iii) Messages
221
222 create_thin <dev id>
223
224 Create a new thinly-provisioned device.
225 <dev id> is an arbitrary unique 24-bit identifier chosen by
226 the caller.
227
228 create_snap <dev id> <origin id>
229
230 Create a new snapshot of another thinly-provisioned device.
231 <dev id> is an arbitrary unique 24-bit identifier chosen by
232 the caller.
233 <origin id> is the identifier of the thinly-provisioned device
234 of which the new device will be a snapshot.
235
236 delete <dev id>
237
238 Deletes a thin device. Irreversible.
239
240 trim <dev id> <new size in sectors>
241
242 Delete mappings from the end of a thin device. Irreversible.
243 You might want to use this if you're reducing the size of
244 your thinly-provisioned device. In many cases, due to the
245 sharing of blocks between devices, it is not possible to
246 determine in advance how much space 'trim' will release. (In
247 future a userspace tool might be able to perform this
248 calculation.)
249
250 set_transaction_id <current id> <new id>
251
252 Userland volume managers, such as LVM, need a way to
253 synchronise their external metadata with the internal metadata of the
254 pool target. The thin-pool target offers to store an
255 arbitrary 64-bit transaction id and return it on the target's
256 status line. To avoid races you must provide what you think
257 the current transaction id is when you change it with this
258 compare-and-swap message.
259
260'thin' target
261-------------
262
263i) Constructor
264
265 thin <pool dev> <dev id>
266
267 pool dev:
268 the thin-pool device, e.g. /dev/mapper/my_pool or 253:0
269
270 dev id:
271 the internal device identifier of the device to be
272 activated.
273
274The pool doesn't store any size against the thin devices. If you
275load a thin target that is smaller than you've been using previously,
276then you'll have no access to blocks mapped beyond the end. If you
277load a target that is bigger than before, then extra blocks will be
278provisioned as and when needed.
279
280If you wish to reduce the size of your thin device and potentially
281regain some space then send the 'trim' message to the pool.
282
283ii) Status
284
285 <nr mapped sectors> <highest mapped sector>
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index b1a92149704..faa4741df6d 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -216,6 +216,8 @@ config DM_BUFIO
216 as a cache, holding recently-read blocks in memory and performing 216 as a cache, holding recently-read blocks in memory and performing
217 delayed writes. 217 delayed writes.
218 218
219source "drivers/md/persistent-data/Kconfig"
220
219config DM_CRYPT 221config DM_CRYPT
220 tristate "Crypt target support" 222 tristate "Crypt target support"
221 depends on BLK_DEV_DM 223 depends on BLK_DEV_DM
@@ -241,6 +243,32 @@ config DM_SNAPSHOT
241 ---help--- 243 ---help---
242 Allow volume managers to take writable snapshots of a device. 244 Allow volume managers to take writable snapshots of a device.
243 245
246config DM_THIN_PROVISIONING
247 tristate "Thin provisioning target (EXPERIMENTAL)"
248 depends on BLK_DEV_DM && EXPERIMENTAL
249 select DM_PERSISTENT_DATA
250 ---help---
251 Provides thin provisioning and snapshots that share a data store.
252
253config DM_DEBUG_BLOCK_STACK_TRACING
254 boolean "Keep stack trace of thin provisioning block lock holders"
255 depends on STACKTRACE_SUPPORT && DM_THIN_PROVISIONING
256 select STACKTRACE
257 ---help---
258 Enable this for messages that may help debug problems with the
259 block manager locking used by thin provisioning.
260
261 If unsure, say N.
262
263config DM_DEBUG_SPACE_MAPS
264 boolean "Extra validation for thin provisioning space maps"
265 depends on DM_THIN_PROVISIONING
266 ---help---
267 Enable this for messages that may help debug problems with the
268 space maps used by thin provisioning.
269
270 If unsure, say N.
271
244config DM_MIRROR 272config DM_MIRROR
245 tristate "Mirror target" 273 tristate "Mirror target"
246 depends on BLK_DEV_DM 274 depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 56661c4272f..046860c7a16 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -10,6 +10,7 @@ dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \
10dm-mirror-y += dm-raid1.o 10dm-mirror-y += dm-raid1.o
11dm-log-userspace-y \ 11dm-log-userspace-y \
12 += dm-log-userspace-base.o dm-log-userspace-transfer.o 12 += dm-log-userspace-base.o dm-log-userspace-transfer.o
13dm-thin-pool-y += dm-thin.o dm-thin-metadata.o
13md-mod-y += md.o bitmap.o 14md-mod-y += md.o bitmap.o
14raid456-y += raid5.o 15raid456-y += raid5.o
15 16
@@ -35,10 +36,12 @@ obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
35obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o 36obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
36obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o 37obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o
37obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o 38obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
39obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/
38obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o 40obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
39obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o 41obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
40obj-$(CONFIG_DM_ZERO) += dm-zero.o 42obj-$(CONFIG_DM_ZERO) += dm-zero.o
41obj-$(CONFIG_DM_RAID) += dm-raid.o 43obj-$(CONFIG_DM_RAID) += dm-raid.o
44obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
42 45
43ifeq ($(CONFIG_DM_UEVENT),y) 46ifeq ($(CONFIG_DM_UEVENT),y)
44dm-mod-objs += dm-uevent.o 47dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
new file mode 100644
index 00000000000..59c4f0446ff
--- /dev/null
+++ b/drivers/md/dm-thin-metadata.c
@@ -0,0 +1,1391 @@
1/*
2 * Copyright (C) 2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-thin-metadata.h"
8#include "persistent-data/dm-btree.h"
9#include "persistent-data/dm-space-map.h"
10#include "persistent-data/dm-space-map-disk.h"
11#include "persistent-data/dm-transaction-manager.h"
12
13#include <linux/list.h>
14#include <linux/device-mapper.h>
15#include <linux/workqueue.h>
16
17/*--------------------------------------------------------------------------
18 * As far as the metadata goes, there is:
19 *
20 * - A superblock in block zero, taking up fewer than 512 bytes for
21 * atomic writes.
22 *
23 * - A space map managing the metadata blocks.
24 *
25 * - A space map managing the data blocks.
26 *
27 * - A btree mapping our internal thin dev ids onto struct disk_device_details.
28 *
29 * - A hierarchical btree, with 2 levels which effectively maps (thin
30 * dev id, virtual block) -> block_time. Block time is a 64-bit
31 * field holding the time in the low 24 bits, and block in the top 48
32 * bits.
33 *
34 * BTrees consist solely of btree_nodes, that fill a block. Some are
35 * internal nodes, as such their values are a __le64 pointing to other
36 * nodes. Leaf nodes can store data of any reasonable size (ie. much
37 * smaller than the block size). The nodes consist of the header,
38 * followed by an array of keys, followed by an array of values. We have
39 * to binary search on the keys so they're all held together to help the
40 * cpu cache.
41 *
42 * Space maps have 2 btrees:
43 *
44 * - One maps a uint64_t onto a struct index_entry, which points to a
45 * bitmap block and has some details about how many free entries there
46 * are etc.
47 *
48 * - The bitmap blocks have a header (for the checksum). Then the rest
49 * of the block is pairs of bits. With the meaning being:
50 *
51 * 0 - ref count is 0
52 * 1 - ref count is 1
53 * 2 - ref count is 2
54 * 3 - ref count is higher than 2
55 *
56 * - If the count is higher than 2 then the ref count is entered in a
57 * second btree that directly maps the block_address to a uint32_t ref
58 * count.
59 *
60 * The space map metadata variant doesn't have a bitmaps btree. Instead
61 * it has one single block's worth of index_entries. This avoids
62 * recursive issues with the bitmap btree needing to allocate space in
63 * order to insert. With a small data block size such as 64k the
64 * metadata can support data devices that are hundreds of terabytes.
65 *
66 * The space maps allocate space linearly from front to back. Space that
67 * is freed in a transaction is never recycled within that transaction.
68 * To try and avoid fragmenting _free_ space the allocator always goes
69 * back and fills in gaps.
70 *
71 * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks
72 * from the block manager.
73 *--------------------------------------------------------------------------*/
74
75#define DM_MSG_PREFIX "thin metadata"
76
77#define THIN_SUPERBLOCK_MAGIC 27022010
78#define THIN_SUPERBLOCK_LOCATION 0
79#define THIN_VERSION 1
80#define THIN_METADATA_CACHE_SIZE 64
81#define SECTOR_TO_BLOCK_SHIFT 3
82
83/* This should be plenty */
84#define SPACE_MAP_ROOT_SIZE 128
85
86/*
87 * Little endian on-disk superblock and device details.
88 */
89struct thin_disk_superblock {
90 __le32 csum; /* Checksum of superblock except for this field. */
91 __le32 flags;
92 __le64 blocknr; /* This block number, dm_block_t. */
93
94 __u8 uuid[16];
95 __le64 magic;
96 __le32 version;
97 __le32 time;
98
99 __le64 trans_id;
100
101 /*
102 * Root held by userspace transactions.
103 */
104 __le64 held_root;
105
106 __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
107 __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
108
109 /*
110 * 2-level btree mapping (dev_id, (dev block, time)) -> data block
111 */
112 __le64 data_mapping_root;
113
114 /*
115 * Device detail root mapping dev_id -> device_details
116 */
117 __le64 device_details_root;
118
119 __le32 data_block_size; /* In 512-byte sectors. */
120
121 __le32 metadata_block_size; /* In 512-byte sectors. */
122 __le64 metadata_nr_blocks;
123
124 __le32 compat_flags;
125 __le32 compat_ro_flags;
126 __le32 incompat_flags;
127} __packed;
128
129struct disk_device_details {
130 __le64 mapped_blocks;
131 __le64 transaction_id; /* When created. */
132 __le32 creation_time;
133 __le32 snapshotted_time;
134} __packed;
135
136struct dm_pool_metadata {
137 struct hlist_node hash;
138
139 struct block_device *bdev;
140 struct dm_block_manager *bm;
141 struct dm_space_map *metadata_sm;
142 struct dm_space_map *data_sm;
143 struct dm_transaction_manager *tm;
144 struct dm_transaction_manager *nb_tm;
145
146 /*
147 * Two-level btree.
148 * First level holds thin_dev_t.
149 * Second level holds mappings.
150 */
151 struct dm_btree_info info;
152
153 /*
154 * Non-blocking version of the above.
155 */
156 struct dm_btree_info nb_info;
157
158 /*
159 * Just the top level for deleting whole devices.
160 */
161 struct dm_btree_info tl_info;
162
163 /*
164 * Just the bottom level for creating new devices.
165 */
166 struct dm_btree_info bl_info;
167
168 /*
169 * Describes the device details btree.
170 */
171 struct dm_btree_info details_info;
172
173 struct rw_semaphore root_lock;
174 uint32_t time;
175 int need_commit;
176 dm_block_t root;
177 dm_block_t details_root;
178 struct list_head thin_devices;
179 uint64_t trans_id;
180 unsigned long flags;
181 sector_t data_block_size;
182};
183
184struct dm_thin_device {
185 struct list_head list;
186 struct dm_pool_metadata *pmd;
187 dm_thin_id id;
188
189 int open_count;
190 int changed;
191 uint64_t mapped_blocks;
192 uint64_t transaction_id;
193 uint32_t creation_time;
194 uint32_t snapshotted_time;
195};
196
197/*----------------------------------------------------------------
198 * superblock validator
199 *--------------------------------------------------------------*/
200
201#define SUPERBLOCK_CSUM_XOR 160774
202
203static void sb_prepare_for_write(struct dm_block_validator *v,
204 struct dm_block *b,
205 size_t block_size)
206{
207 struct thin_disk_superblock *disk_super = dm_block_data(b);
208
209 disk_super->blocknr = cpu_to_le64(dm_block_location(b));
210 disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
211 block_size - sizeof(__le32),
212 SUPERBLOCK_CSUM_XOR));
213}
214
215static int sb_check(struct dm_block_validator *v,
216 struct dm_block *b,
217 size_t block_size)
218{
219 struct thin_disk_superblock *disk_super = dm_block_data(b);
220 __le32 csum_le;
221
222 if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
223 DMERR("sb_check failed: blocknr %llu: "
224 "wanted %llu", le64_to_cpu(disk_super->blocknr),
225 (unsigned long long)dm_block_location(b));
226 return -ENOTBLK;
227 }
228
229 if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) {
230 DMERR("sb_check failed: magic %llu: "
231 "wanted %llu", le64_to_cpu(disk_super->magic),
232 (unsigned long long)THIN_SUPERBLOCK_MAGIC);
233 return -EILSEQ;
234 }
235
236 csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
237 block_size - sizeof(__le32),
238 SUPERBLOCK_CSUM_XOR));
239 if (csum_le != disk_super->csum) {
240 DMERR("sb_check failed: csum %u: wanted %u",
241 le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
242 return -EILSEQ;
243 }
244
245 return 0;
246}
247
248static struct dm_block_validator sb_validator = {
249 .name = "superblock",
250 .prepare_for_write = sb_prepare_for_write,
251 .check = sb_check
252};
253
254/*----------------------------------------------------------------
255 * Methods for the btree value types
256 *--------------------------------------------------------------*/
257
258static uint64_t pack_block_time(dm_block_t b, uint32_t t)
259{
260 return (b << 24) | t;
261}
262
263static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
264{
265 *b = v >> 24;
266 *t = v & ((1 << 24) - 1);
267}
268
269static void data_block_inc(void *context, void *value_le)
270{
271 struct dm_space_map *sm = context;
272 __le64 v_le;
273 uint64_t b;
274 uint32_t t;
275
276 memcpy(&v_le, value_le, sizeof(v_le));
277 unpack_block_time(le64_to_cpu(v_le), &b, &t);
278 dm_sm_inc_block(sm, b);
279}
280
281static void data_block_dec(void *context, void *value_le)
282{
283 struct dm_space_map *sm = context;
284 __le64 v_le;
285 uint64_t b;
286 uint32_t t;
287
288 memcpy(&v_le, value_le, sizeof(v_le));
289 unpack_block_time(le64_to_cpu(v_le), &b, &t);
290 dm_sm_dec_block(sm, b);
291}
292
293static int data_block_equal(void *context, void *value1_le, void *value2_le)
294{
295 __le64 v1_le, v2_le;
296 uint64_t b1, b2;
297 uint32_t t;
298
299 memcpy(&v1_le, value1_le, sizeof(v1_le));
300 memcpy(&v2_le, value2_le, sizeof(v2_le));
301 unpack_block_time(le64_to_cpu(v1_le), &b1, &t);
302 unpack_block_time(le64_to_cpu(v2_le), &b2, &t);
303
304 return b1 == b2;
305}
306
307static void subtree_inc(void *context, void *value)
308{
309 struct dm_btree_info *info = context;
310 __le64 root_le;
311 uint64_t root;
312
313 memcpy(&root_le, value, sizeof(root_le));
314 root = le64_to_cpu(root_le);
315 dm_tm_inc(info->tm, root);
316}
317
318static void subtree_dec(void *context, void *value)
319{
320 struct dm_btree_info *info = context;
321 __le64 root_le;
322 uint64_t root;
323
324 memcpy(&root_le, value, sizeof(root_le));
325 root = le64_to_cpu(root_le);
326 if (dm_btree_del(info, root))
327 DMERR("btree delete failed\n");
328}
329
330static int subtree_equal(void *context, void *value1_le, void *value2_le)
331{
332 __le64 v1_le, v2_le;
333 memcpy(&v1_le, value1_le, sizeof(v1_le));
334 memcpy(&v2_le, value2_le, sizeof(v2_le));
335
336 return v1_le == v2_le;
337}
338
339/*----------------------------------------------------------------*/
340
341static int superblock_all_zeroes(struct dm_block_manager *bm, int *result)
342{
343 int r;
344 unsigned i;
345 struct dm_block *b;
346 __le64 *data_le, zero = cpu_to_le64(0);
347 unsigned block_size = dm_bm_block_size(bm) / sizeof(__le64);
348
349 /*
350 * We can't use a validator here - it may be all zeroes.
351 */
352 r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b);
353 if (r)
354 return r;
355
356 data_le = dm_block_data(b);
357 *result = 1;
358 for (i = 0; i < block_size; i++) {
359 if (data_le[i] != zero) {
360 *result = 0;
361 break;
362 }
363 }
364
365 return dm_bm_unlock(b);
366}
367
368static int init_pmd(struct dm_pool_metadata *pmd,
369 struct dm_block_manager *bm,
370 dm_block_t nr_blocks, int create)
371{
372 int r;
373 struct dm_space_map *sm, *data_sm;
374 struct dm_transaction_manager *tm;
375 struct dm_block *sblock;
376
377 if (create) {
378 r = dm_tm_create_with_sm(bm, THIN_SUPERBLOCK_LOCATION,
379 &sb_validator, &tm, &sm, &sblock);
380 if (r < 0) {
381 DMERR("tm_create_with_sm failed");
382 return r;
383 }
384
385 data_sm = dm_sm_disk_create(tm, nr_blocks);
386 if (IS_ERR(data_sm)) {
387 DMERR("sm_disk_create failed");
388 r = PTR_ERR(data_sm);
389 goto bad;
390 }
391 } else {
392 struct thin_disk_superblock *disk_super = NULL;
393 size_t space_map_root_offset =
394 offsetof(struct thin_disk_superblock, metadata_space_map_root);
395
396 r = dm_tm_open_with_sm(bm, THIN_SUPERBLOCK_LOCATION,
397 &sb_validator, space_map_root_offset,
398 SPACE_MAP_ROOT_SIZE, &tm, &sm, &sblock);
399 if (r < 0) {
400 DMERR("tm_open_with_sm failed");
401 return r;
402 }
403
404 disk_super = dm_block_data(sblock);
405 data_sm = dm_sm_disk_open(tm, disk_super->data_space_map_root,
406 sizeof(disk_super->data_space_map_root));
407 if (IS_ERR(data_sm)) {
408 DMERR("sm_disk_open failed");
409 r = PTR_ERR(data_sm);
410 goto bad;
411 }
412 }
413
414
415 r = dm_tm_unlock(tm, sblock);
416 if (r < 0) {
417 DMERR("couldn't unlock superblock");
418 goto bad_data_sm;
419 }
420
421 pmd->bm = bm;
422 pmd->metadata_sm = sm;
423 pmd->data_sm = data_sm;
424 pmd->tm = tm;
425 pmd->nb_tm = dm_tm_create_non_blocking_clone(tm);
426 if (!pmd->nb_tm) {
427 DMERR("could not create clone tm");
428 r = -ENOMEM;
429 goto bad_data_sm;
430 }
431
432 pmd->info.tm = tm;
433 pmd->info.levels = 2;
434 pmd->info.value_type.context = pmd->data_sm;
435 pmd->info.value_type.size = sizeof(__le64);
436 pmd->info.value_type.inc = data_block_inc;
437 pmd->info.value_type.dec = data_block_dec;
438 pmd->info.value_type.equal = data_block_equal;
439
440 memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info));
441 pmd->nb_info.tm = pmd->nb_tm;
442
443 pmd->tl_info.tm = tm;
444 pmd->tl_info.levels = 1;
445 pmd->tl_info.value_type.context = &pmd->info;
446 pmd->tl_info.value_type.size = sizeof(__le64);
447 pmd->tl_info.value_type.inc = subtree_inc;
448 pmd->tl_info.value_type.dec = subtree_dec;
449 pmd->tl_info.value_type.equal = subtree_equal;
450
451 pmd->bl_info.tm = tm;
452 pmd->bl_info.levels = 1;
453 pmd->bl_info.value_type.context = pmd->data_sm;
454 pmd->bl_info.value_type.size = sizeof(__le64);
455 pmd->bl_info.value_type.inc = data_block_inc;
456 pmd->bl_info.value_type.dec = data_block_dec;
457 pmd->bl_info.value_type.equal = data_block_equal;
458
459 pmd->details_info.tm = tm;
460 pmd->details_info.levels = 1;
461 pmd->details_info.value_type.context = NULL;
462 pmd->details_info.value_type.size = sizeof(struct disk_device_details);
463 pmd->details_info.value_type.inc = NULL;
464 pmd->details_info.value_type.dec = NULL;
465 pmd->details_info.value_type.equal = NULL;
466
467 pmd->root = 0;
468
469 init_rwsem(&pmd->root_lock);
470 pmd->time = 0;
471 pmd->need_commit = 0;
472 pmd->details_root = 0;
473 pmd->trans_id = 0;
474 pmd->flags = 0;
475 INIT_LIST_HEAD(&pmd->thin_devices);
476
477 return 0;
478
479bad_data_sm:
480 dm_sm_destroy(data_sm);
481bad:
482 dm_tm_destroy(tm);
483 dm_sm_destroy(sm);
484
485 return r;
486}
487
488static int __begin_transaction(struct dm_pool_metadata *pmd)
489{
490 int r;
491 u32 features;
492 struct thin_disk_superblock *disk_super;
493 struct dm_block *sblock;
494
495 /*
496 * __maybe_commit_transaction() resets these
497 */
498 WARN_ON(pmd->need_commit);
499
500 /*
501 * We re-read the superblock every time. Shouldn't need to do this
502 * really.
503 */
504 r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
505 &sb_validator, &sblock);
506 if (r)
507 return r;
508
509 disk_super = dm_block_data(sblock);
510 pmd->time = le32_to_cpu(disk_super->time);
511 pmd->root = le64_to_cpu(disk_super->data_mapping_root);
512 pmd->details_root = le64_to_cpu(disk_super->device_details_root);
513 pmd->trans_id = le64_to_cpu(disk_super->trans_id);
514 pmd->flags = le32_to_cpu(disk_super->flags);
515 pmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
516
517 features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
518 if (features) {
519 DMERR("could not access metadata due to "
520 "unsupported optional features (%lx).",
521 (unsigned long)features);
522 r = -EINVAL;
523 goto out;
524 }
525
526 /*
527 * Check for read-only metadata to skip the following RDWR checks.
528 */
529 if (get_disk_ro(pmd->bdev->bd_disk))
530 goto out;
531
532 features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
533 if (features) {
534 DMERR("could not access metadata RDWR due to "
535 "unsupported optional features (%lx).",
536 (unsigned long)features);
537 r = -EINVAL;
538 }
539
540out:
541 dm_bm_unlock(sblock);
542 return r;
543}
544
545static int __write_changed_details(struct dm_pool_metadata *pmd)
546{
547 int r;
548 struct dm_thin_device *td, *tmp;
549 struct disk_device_details details;
550 uint64_t key;
551
552 list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
553 if (!td->changed)
554 continue;
555
556 key = td->id;
557
558 details.mapped_blocks = cpu_to_le64(td->mapped_blocks);
559 details.transaction_id = cpu_to_le64(td->transaction_id);
560 details.creation_time = cpu_to_le32(td->creation_time);
561 details.snapshotted_time = cpu_to_le32(td->snapshotted_time);
562 __dm_bless_for_disk(&details);
563
564 r = dm_btree_insert(&pmd->details_info, pmd->details_root,
565 &key, &details, &pmd->details_root);
566 if (r)
567 return r;
568
569 if (td->open_count)
570 td->changed = 0;
571 else {
572 list_del(&td->list);
573 kfree(td);
574 }
575
576 pmd->need_commit = 1;
577 }
578
579 return 0;
580}
581
582static int __commit_transaction(struct dm_pool_metadata *pmd)
583{
584 /*
585 * FIXME: Associated pool should be made read-only on failure.
586 */
587 int r;
588 size_t metadata_len, data_len;
589 struct thin_disk_superblock *disk_super;
590 struct dm_block *sblock;
591
592 /*
593 * We need to know if the thin_disk_superblock exceeds a 512-byte sector.
594 */
595 BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
596
597 r = __write_changed_details(pmd);
598 if (r < 0)
599 goto out;
600
601 if (!pmd->need_commit)
602 goto out;
603
604 r = dm_sm_commit(pmd->data_sm);
605 if (r < 0)
606 goto out;
607
608 r = dm_tm_pre_commit(pmd->tm);
609 if (r < 0)
610 goto out;
611
612 r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
613 if (r < 0)
614 goto out;
615
616 r = dm_sm_root_size(pmd->metadata_sm, &data_len);
617 if (r < 0)
618 goto out;
619
620 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
621 &sb_validator, &sblock);
622 if (r)
623 goto out;
624
625 disk_super = dm_block_data(sblock);
626 disk_super->time = cpu_to_le32(pmd->time);
627 disk_super->data_mapping_root = cpu_to_le64(pmd->root);
628 disk_super->device_details_root = cpu_to_le64(pmd->details_root);
629 disk_super->trans_id = cpu_to_le64(pmd->trans_id);
630 disk_super->flags = cpu_to_le32(pmd->flags);
631
632 r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root,
633 metadata_len);
634 if (r < 0)
635 goto out_locked;
636
637 r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root,
638 data_len);
639 if (r < 0)
640 goto out_locked;
641
642 r = dm_tm_commit(pmd->tm, sblock);
643 if (!r)
644 pmd->need_commit = 0;
645
646out:
647 return r;
648
649out_locked:
650 dm_bm_unlock(sblock);
651 return r;
652}
653
654struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
655 sector_t data_block_size)
656{
657 int r;
658 struct thin_disk_superblock *disk_super;
659 struct dm_pool_metadata *pmd;
660 sector_t bdev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
661 struct dm_block_manager *bm;
662 int create;
663 struct dm_block *sblock;
664
665 pmd = kmalloc(sizeof(*pmd), GFP_KERNEL);
666 if (!pmd) {
667 DMERR("could not allocate metadata struct");
668 return ERR_PTR(-ENOMEM);
669 }
670
671 /*
672 * Max hex locks:
673 * 3 for btree insert +
674 * 2 for btree lookup used within space map
675 */
676 bm = dm_block_manager_create(bdev, THIN_METADATA_BLOCK_SIZE,
677 THIN_METADATA_CACHE_SIZE, 5);
678 if (!bm) {
679 DMERR("could not create block manager");
680 kfree(pmd);
681 return ERR_PTR(-ENOMEM);
682 }
683
684 r = superblock_all_zeroes(bm, &create);
685 if (r) {
686 dm_block_manager_destroy(bm);
687 kfree(pmd);
688 return ERR_PTR(r);
689 }
690
691
692 r = init_pmd(pmd, bm, 0, create);
693 if (r) {
694 dm_block_manager_destroy(bm);
695 kfree(pmd);
696 return ERR_PTR(r);
697 }
698 pmd->bdev = bdev;
699
700 if (!create) {
701 r = __begin_transaction(pmd);
702 if (r < 0)
703 goto bad;
704 return pmd;
705 }
706
707 /*
708 * Create.
709 */
710 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
711 &sb_validator, &sblock);
712 if (r)
713 goto bad;
714
715 disk_super = dm_block_data(sblock);
716 disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
717 disk_super->version = cpu_to_le32(THIN_VERSION);
718 disk_super->time = 0;
719 disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
720 disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
721 disk_super->data_block_size = cpu_to_le32(data_block_size);
722
723 r = dm_bm_unlock(sblock);
724 if (r < 0)
725 goto bad;
726
727 r = dm_btree_empty(&pmd->info, &pmd->root);
728 if (r < 0)
729 goto bad;
730
731 r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
732 if (r < 0) {
733 DMERR("couldn't create devices root");
734 goto bad;
735 }
736
737 pmd->flags = 0;
738 pmd->need_commit = 1;
739 r = dm_pool_commit_metadata(pmd);
740 if (r < 0) {
741 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
742 __func__, r);
743 goto bad;
744 }
745
746 return pmd;
747
748bad:
749 if (dm_pool_metadata_close(pmd) < 0)
750 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
751 return ERR_PTR(r);
752}
753
754int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
755{
756 int r;
757 unsigned open_devices = 0;
758 struct dm_thin_device *td, *tmp;
759
760 down_read(&pmd->root_lock);
761 list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
762 if (td->open_count)
763 open_devices++;
764 else {
765 list_del(&td->list);
766 kfree(td);
767 }
768 }
769 up_read(&pmd->root_lock);
770
771 if (open_devices) {
772 DMERR("attempt to close pmd when %u device(s) are still open",
773 open_devices);
774 return -EBUSY;
775 }
776
777 r = __commit_transaction(pmd);
778 if (r < 0)
779 DMWARN("%s: __commit_transaction() failed, error = %d",
780 __func__, r);
781
782 dm_tm_destroy(pmd->tm);
783 dm_tm_destroy(pmd->nb_tm);
784 dm_block_manager_destroy(pmd->bm);
785 dm_sm_destroy(pmd->metadata_sm);
786 dm_sm_destroy(pmd->data_sm);
787 kfree(pmd);
788
789 return 0;
790}
791
792static int __open_device(struct dm_pool_metadata *pmd,
793 dm_thin_id dev, int create,
794 struct dm_thin_device **td)
795{
796 int r, changed = 0;
797 struct dm_thin_device *td2;
798 uint64_t key = dev;
799 struct disk_device_details details_le;
800
801 /*
802 * Check the device isn't already open.
803 */
804 list_for_each_entry(td2, &pmd->thin_devices, list)
805 if (td2->id == dev) {
806 td2->open_count++;
807 *td = td2;
808 return 0;
809 }
810
811 /*
812 * Check the device exists.
813 */
814 r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
815 &key, &details_le);
816 if (r) {
817 if (r != -ENODATA || !create)
818 return r;
819
820 changed = 1;
821 details_le.mapped_blocks = 0;
822 details_le.transaction_id = cpu_to_le64(pmd->trans_id);
823 details_le.creation_time = cpu_to_le32(pmd->time);
824 details_le.snapshotted_time = cpu_to_le32(pmd->time);
825 }
826
827 *td = kmalloc(sizeof(**td), GFP_NOIO);
828 if (!*td)
829 return -ENOMEM;
830
831 (*td)->pmd = pmd;
832 (*td)->id = dev;
833 (*td)->open_count = 1;
834 (*td)->changed = changed;
835 (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks);
836 (*td)->transaction_id = le64_to_cpu(details_le.transaction_id);
837 (*td)->creation_time = le32_to_cpu(details_le.creation_time);
838 (*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time);
839
840 list_add(&(*td)->list, &pmd->thin_devices);
841
842 return 0;
843}
844
845static void __close_device(struct dm_thin_device *td)
846{
847 --td->open_count;
848}
849
850static int __create_thin(struct dm_pool_metadata *pmd,
851 dm_thin_id dev)
852{
853 int r;
854 dm_block_t dev_root;
855 uint64_t key = dev;
856 struct disk_device_details details_le;
857 struct dm_thin_device *td;
858 __le64 value;
859
860 r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
861 &key, &details_le);
862 if (!r)
863 return -EEXIST;
864
865 /*
866 * Create an empty btree for the mappings.
867 */
868 r = dm_btree_empty(&pmd->bl_info, &dev_root);
869 if (r)
870 return r;
871
872 /*
873 * Insert it into the main mapping tree.
874 */
875 value = cpu_to_le64(dev_root);
876 __dm_bless_for_disk(&value);
877 r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
878 if (r) {
879 dm_btree_del(&pmd->bl_info, dev_root);
880 return r;
881 }
882
883 r = __open_device(pmd, dev, 1, &td);
884 if (r) {
885 __close_device(td);
886 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
887 dm_btree_del(&pmd->bl_info, dev_root);
888 return r;
889 }
890 td->changed = 1;
891 __close_device(td);
892
893 return r;
894}
895
896int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
897{
898 int r;
899
900 down_write(&pmd->root_lock);
901 r = __create_thin(pmd, dev);
902 up_write(&pmd->root_lock);
903
904 return r;
905}
906
907static int __set_snapshot_details(struct dm_pool_metadata *pmd,
908 struct dm_thin_device *snap,
909 dm_thin_id origin, uint32_t time)
910{
911 int r;
912 struct dm_thin_device *td;
913
914 r = __open_device(pmd, origin, 0, &td);
915 if (r)
916 return r;
917
918 td->changed = 1;
919 td->snapshotted_time = time;
920
921 snap->mapped_blocks = td->mapped_blocks;
922 snap->snapshotted_time = time;
923 __close_device(td);
924
925 return 0;
926}
927
928static int __create_snap(struct dm_pool_metadata *pmd,
929 dm_thin_id dev, dm_thin_id origin)
930{
931 int r;
932 dm_block_t origin_root;
933 uint64_t key = origin, dev_key = dev;
934 struct dm_thin_device *td;
935 struct disk_device_details details_le;
936 __le64 value;
937
938 /* check this device is unused */
939 r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
940 &dev_key, &details_le);
941 if (!r)
942 return -EEXIST;
943
944 /* find the mapping tree for the origin */
945 r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value);
946 if (r)
947 return r;
948 origin_root = le64_to_cpu(value);
949
950 /* clone the origin, an inc will do */
951 dm_tm_inc(pmd->tm, origin_root);
952
953 /* insert into the main mapping tree */
954 value = cpu_to_le64(origin_root);
955 __dm_bless_for_disk(&value);
956 key = dev;
957 r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
958 if (r) {
959 dm_tm_dec(pmd->tm, origin_root);
960 return r;
961 }
962
963 pmd->time++;
964
965 r = __open_device(pmd, dev, 1, &td);
966 if (r)
967 goto bad;
968
969 r = __set_snapshot_details(pmd, td, origin, pmd->time);
970 if (r)
971 goto bad;
972
973 __close_device(td);
974 return 0;
975
976bad:
977 __close_device(td);
978 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
979 dm_btree_remove(&pmd->details_info, pmd->details_root,
980 &key, &pmd->details_root);
981 return r;
982}
983
984int dm_pool_create_snap(struct dm_pool_metadata *pmd,
985 dm_thin_id dev,
986 dm_thin_id origin)
987{
988 int r;
989
990 down_write(&pmd->root_lock);
991 r = __create_snap(pmd, dev, origin);
992 up_write(&pmd->root_lock);
993
994 return r;
995}
996
997static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev)
998{
999 int r;
1000 uint64_t key = dev;
1001 struct dm_thin_device *td;
1002
1003 /* TODO: failure should mark the transaction invalid */
1004 r = __open_device(pmd, dev, 0, &td);
1005 if (r)
1006 return r;
1007
1008 if (td->open_count > 1) {
1009 __close_device(td);
1010 return -EBUSY;
1011 }
1012
1013 list_del(&td->list);
1014 kfree(td);
1015 r = dm_btree_remove(&pmd->details_info, pmd->details_root,
1016 &key, &pmd->details_root);
1017 if (r)
1018 return r;
1019
1020 r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
1021 if (r)
1022 return r;
1023
1024 pmd->need_commit = 1;
1025
1026 return 0;
1027}
1028
1029int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
1030 dm_thin_id dev)
1031{
1032 int r;
1033
1034 down_write(&pmd->root_lock);
1035 r = __delete_device(pmd, dev);
1036 up_write(&pmd->root_lock);
1037
1038 return r;
1039}
1040
1041int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
1042 uint64_t current_id,
1043 uint64_t new_id)
1044{
1045 down_write(&pmd->root_lock);
1046 if (pmd->trans_id != current_id) {
1047 up_write(&pmd->root_lock);
1048 DMERR("mismatched transaction id");
1049 return -EINVAL;
1050 }
1051
1052 pmd->trans_id = new_id;
1053 pmd->need_commit = 1;
1054 up_write(&pmd->root_lock);
1055
1056 return 0;
1057}
1058
1059int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
1060 uint64_t *result)
1061{
1062 down_read(&pmd->root_lock);
1063 *result = pmd->trans_id;
1064 up_read(&pmd->root_lock);
1065
1066 return 0;
1067}
1068
1069static int __get_held_metadata_root(struct dm_pool_metadata *pmd,
1070 dm_block_t *result)
1071{
1072 int r;
1073 struct thin_disk_superblock *disk_super;
1074 struct dm_block *sblock;
1075
1076 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
1077 &sb_validator, &sblock);
1078 if (r)
1079 return r;
1080
1081 disk_super = dm_block_data(sblock);
1082 *result = le64_to_cpu(disk_super->held_root);
1083
1084 return dm_bm_unlock(sblock);
1085}
1086
1087int dm_pool_get_held_metadata_root(struct dm_pool_metadata *pmd,
1088 dm_block_t *result)
1089{
1090 int r;
1091
1092 down_read(&pmd->root_lock);
1093 r = __get_held_metadata_root(pmd, result);
1094 up_read(&pmd->root_lock);
1095
1096 return r;
1097}
1098
1099int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
1100 struct dm_thin_device **td)
1101{
1102 int r;
1103
1104 down_write(&pmd->root_lock);
1105 r = __open_device(pmd, dev, 0, td);
1106 up_write(&pmd->root_lock);
1107
1108 return r;
1109}
1110
1111int dm_pool_close_thin_device(struct dm_thin_device *td)
1112{
1113 down_write(&td->pmd->root_lock);
1114 __close_device(td);
1115 up_write(&td->pmd->root_lock);
1116
1117 return 0;
1118}
1119
1120dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
1121{
1122 return td->id;
1123}
1124
1125static int __snapshotted_since(struct dm_thin_device *td, uint32_t time)
1126{
1127 return td->snapshotted_time > time;
1128}
1129
1130int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
1131 int can_block, struct dm_thin_lookup_result *result)
1132{
1133 int r;
1134 uint64_t block_time = 0;
1135 __le64 value;
1136 struct dm_pool_metadata *pmd = td->pmd;
1137 dm_block_t keys[2] = { td->id, block };
1138
1139 if (can_block) {
1140 down_read(&pmd->root_lock);
1141 r = dm_btree_lookup(&pmd->info, pmd->root, keys, &value);
1142 if (!r)
1143 block_time = le64_to_cpu(value);
1144 up_read(&pmd->root_lock);
1145
1146 } else if (down_read_trylock(&pmd->root_lock)) {
1147 r = dm_btree_lookup(&pmd->nb_info, pmd->root, keys, &value);
1148 if (!r)
1149 block_time = le64_to_cpu(value);
1150 up_read(&pmd->root_lock);
1151
1152 } else
1153 return -EWOULDBLOCK;
1154
1155 if (!r) {
1156 dm_block_t exception_block;
1157 uint32_t exception_time;
1158 unpack_block_time(block_time, &exception_block,
1159 &exception_time);
1160 result->block = exception_block;
1161 result->shared = __snapshotted_since(td, exception_time);
1162 }
1163
1164 return r;
1165}
1166
1167static int __insert(struct dm_thin_device *td, dm_block_t block,
1168 dm_block_t data_block)
1169{
1170 int r, inserted;
1171 __le64 value;
1172 struct dm_pool_metadata *pmd = td->pmd;
1173 dm_block_t keys[2] = { td->id, block };
1174
1175 pmd->need_commit = 1;
1176 value = cpu_to_le64(pack_block_time(data_block, pmd->time));
1177 __dm_bless_for_disk(&value);
1178
1179 r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value,
1180 &pmd->root, &inserted);
1181 if (r)
1182 return r;
1183
1184 if (inserted) {
1185 td->mapped_blocks++;
1186 td->changed = 1;
1187 }
1188
1189 return 0;
1190}
1191
1192int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
1193 dm_block_t data_block)
1194{
1195 int r;
1196
1197 down_write(&td->pmd->root_lock);
1198 r = __insert(td, block, data_block);
1199 up_write(&td->pmd->root_lock);
1200
1201 return r;
1202}
1203
1204static int __remove(struct dm_thin_device *td, dm_block_t block)
1205{
1206 int r;
1207 struct dm_pool_metadata *pmd = td->pmd;
1208 dm_block_t keys[2] = { td->id, block };
1209
1210 r = dm_btree_remove(&pmd->info, pmd->root, keys, &pmd->root);
1211 if (r)
1212 return r;
1213
1214 pmd->need_commit = 1;
1215
1216 return 0;
1217}
1218
1219int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
1220{
1221 int r;
1222
1223 down_write(&td->pmd->root_lock);
1224 r = __remove(td, block);
1225 up_write(&td->pmd->root_lock);
1226
1227 return r;
1228}
1229
1230int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
1231{
1232 int r;
1233
1234 down_write(&pmd->root_lock);
1235
1236 r = dm_sm_new_block(pmd->data_sm, result);
1237 pmd->need_commit = 1;
1238
1239 up_write(&pmd->root_lock);
1240
1241 return r;
1242}
1243
1244int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
1245{
1246 int r;
1247
1248 down_write(&pmd->root_lock);
1249
1250 r = __commit_transaction(pmd);
1251 if (r <= 0)
1252 goto out;
1253
1254 /*
1255 * Open the next transaction.
1256 */
1257 r = __begin_transaction(pmd);
1258out:
1259 up_write(&pmd->root_lock);
1260 return r;
1261}
1262
1263int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result)
1264{
1265 int r;
1266
1267 down_read(&pmd->root_lock);
1268 r = dm_sm_get_nr_free(pmd->data_sm, result);
1269 up_read(&pmd->root_lock);
1270
1271 return r;
1272}
1273
1274int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
1275 dm_block_t *result)
1276{
1277 int r;
1278
1279 down_read(&pmd->root_lock);
1280 r = dm_sm_get_nr_free(pmd->metadata_sm, result);
1281 up_read(&pmd->root_lock);
1282
1283 return r;
1284}
1285
1286int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
1287 dm_block_t *result)
1288{
1289 int r;
1290
1291 down_read(&pmd->root_lock);
1292 r = dm_sm_get_nr_blocks(pmd->metadata_sm, result);
1293 up_read(&pmd->root_lock);
1294
1295 return r;
1296}
1297
1298int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result)
1299{
1300 down_read(&pmd->root_lock);
1301 *result = pmd->data_block_size;
1302 up_read(&pmd->root_lock);
1303
1304 return 0;
1305}
1306
1307int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
1308{
1309 int r;
1310
1311 down_read(&pmd->root_lock);
1312 r = dm_sm_get_nr_blocks(pmd->data_sm, result);
1313 up_read(&pmd->root_lock);
1314
1315 return r;
1316}
1317
1318int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result)
1319{
1320 struct dm_pool_metadata *pmd = td->pmd;
1321
1322 down_read(&pmd->root_lock);
1323 *result = td->mapped_blocks;
1324 up_read(&pmd->root_lock);
1325
1326 return 0;
1327}
1328
1329static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
1330{
1331 int r;
1332 __le64 value_le;
1333 dm_block_t thin_root;
1334 struct dm_pool_metadata *pmd = td->pmd;
1335
1336 r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le);
1337 if (r)
1338 return r;
1339
1340 thin_root = le64_to_cpu(value_le);
1341
1342 return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result);
1343}
1344
1345int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
1346 dm_block_t *result)
1347{
1348 int r;
1349 struct dm_pool_metadata *pmd = td->pmd;
1350
1351 down_read(&pmd->root_lock);
1352 r = __highest_block(td, result);
1353 up_read(&pmd->root_lock);
1354
1355 return r;
1356}
1357
1358static int __resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
1359{
1360 int r;
1361 dm_block_t old_count;
1362
1363 r = dm_sm_get_nr_blocks(pmd->data_sm, &old_count);
1364 if (r)
1365 return r;
1366
1367 if (new_count == old_count)
1368 return 0;
1369
1370 if (new_count < old_count) {
1371 DMERR("cannot reduce size of data device");
1372 return -EINVAL;
1373 }
1374
1375 r = dm_sm_extend(pmd->data_sm, new_count - old_count);
1376 if (!r)
1377 pmd->need_commit = 1;
1378
1379 return r;
1380}
1381
1382int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
1383{
1384 int r;
1385
1386 down_write(&pmd->root_lock);
1387 r = __resize_data_dev(pmd, new_count);
1388 up_write(&pmd->root_lock);
1389
1390 return r;
1391}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
new file mode 100644
index 00000000000..859c1689687
--- /dev/null
+++ b/drivers/md/dm-thin-metadata.h
@@ -0,0 +1,156 @@
1/*
2 * Copyright (C) 2010-2011 Red Hat, Inc.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_THIN_METADATA_H
8#define DM_THIN_METADATA_H
9
10#include "persistent-data/dm-block-manager.h"
11
12#define THIN_METADATA_BLOCK_SIZE 4096
13
14/*----------------------------------------------------------------*/
15
16struct dm_pool_metadata;
17struct dm_thin_device;
18
19/*
20 * Device identifier
21 */
22typedef uint64_t dm_thin_id;
23
24/*
25 * Reopens or creates a new, empty metadata volume.
26 */
27struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
28 sector_t data_block_size);
29
30int dm_pool_metadata_close(struct dm_pool_metadata *pmd);
31
32/*
33 * Compat feature flags. Any incompat flags beyond the ones
34 * specified below will prevent use of the thin metadata.
35 */
36#define THIN_FEATURE_COMPAT_SUPP 0UL
37#define THIN_FEATURE_COMPAT_RO_SUPP 0UL
38#define THIN_FEATURE_INCOMPAT_SUPP 0UL
39
40/*
41 * Device creation/deletion.
42 */
43int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev);
44
45/*
46 * An internal snapshot.
47 *
48 * You can only snapshot a quiesced origin i.e. one that is either
49 * suspended or not instanced at all.
50 */
51int dm_pool_create_snap(struct dm_pool_metadata *pmd, dm_thin_id dev,
52 dm_thin_id origin);
53
54/*
55 * Deletes a virtual device from the metadata. Fails with -EBUSY if the
56 * device is currently open, and with the lookup error (e.g. -ENODATA) if
57 * no such device exists.
58 */
59int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
60 dm_thin_id dev);
61
62/*
63 * Commits _all_ metadata changes: device creation, deletion, mapping
64 * updates.
65 */
66int dm_pool_commit_metadata(struct dm_pool_metadata *pmd);
67
68/*
69 * Set/get userspace transaction id.
70 */
71int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
72 uint64_t current_id,
73 uint64_t new_id);
74
75int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
76 uint64_t *result);
77
78/*
79 * Hold/get root for userspace transaction.
80 */
81int dm_pool_hold_metadata_root(struct dm_pool_metadata *pmd);
82
83int dm_pool_get_held_metadata_root(struct dm_pool_metadata *pmd,
84 dm_block_t *result);
85
86/*
87 * Actions on a single virtual device.
88 */
89
90/*
91 * Opening an already-open device succeeds and takes another reference on it.
92 */
93int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
94 struct dm_thin_device **td);
95
96int dm_pool_close_thin_device(struct dm_thin_device *td);
97
98dm_thin_id dm_thin_dev_id(struct dm_thin_device *td);
99
/* Filled in by dm_thin_find_block() when the lookup succeeds (returns 0). */
100struct dm_thin_lookup_result {
101 dm_block_t block; /* data block the virtual block is mapped to */
102 int shared; /* non-zero if the device was snapshotted after this mapping's timestamp */
103};
104
105/*
106 * Returns:
107 * -EWOULDBLOCK iff @can_block is not set and the lookup would block.
108 * -ENODATA iff that mapping is not present.
109 * 0 success
110 */
111int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
112 int can_block, struct dm_thin_lookup_result *result);
113
114/*
115 * Obtain an unused block.
116 */
117int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result);
118
119/*
120 * Insert or remove block.
121 */
122int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
123 dm_block_t data_block);
124
125int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block);
126
127/*
128 * Queries.
129 */
130int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
131 dm_block_t *highest_mapped);
132
133int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result);
134
135int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd,
136 dm_block_t *result);
137
138int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
139 dm_block_t *result);
140
141int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
142 dm_block_t *result);
143
144int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result);
145
146int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result);
147
148/*
149 * Returns -EINVAL if the new size is smaller than the current size; the
150 * data device can only be grown.
151 */
152int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_size);
153
154/*----------------------------------------------------------------*/
155
156#endif
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
new file mode 100644
index 00000000000..c3087575fef
--- /dev/null
+++ b/drivers/md/dm-thin.c
@@ -0,0 +1,2428 @@
1/*
2 * Copyright (C) 2011 Red Hat UK.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-thin-metadata.h"
8
9#include <linux/device-mapper.h>
10#include <linux/dm-io.h>
11#include <linux/dm-kcopyd.h>
12#include <linux/list.h>
13#include <linux/init.h>
14#include <linux/module.h>
15#include <linux/slab.h>
16
17#define DM_MSG_PREFIX "thin"
18
19/*
20 * Tunable constants
21 */
22#define ENDIO_HOOK_POOL_SIZE 10240
23#define DEFERRED_SET_SIZE 64
24#define MAPPING_POOL_SIZE 1024
25#define PRISON_CELLS 1024
26
27/*
28 * The block size of the device holding pool data must be
29 * between 64KB and 1GB.
30 */
31#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
32#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
33
34/*
35 * The metadata device is currently limited in size. The limitation is
36 * checked lower down in dm-space-map-metadata, but we also check it here
37 * so we can fail early.
38 *
39 * We have one block of index, which can hold 255 index entries. Each
40 * index entry contains allocation info about 16k metadata blocks.
41 */
42#define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
43
44/*
45 * Device id is restricted to 24 bits.
46 */
47#define MAX_DEV_ID ((1 << 24) - 1)
48
49/*
50 * How do we handle breaking sharing of data blocks?
51 * =================================================
52 *
53 * We use a standard copy-on-write btree to store the mappings for the
54 * devices (note I'm talking about copy-on-write of the metadata here, not
55 * the data). When you take an internal snapshot you clone the root node
56 * of the origin btree. After this there is no concept of an origin or a
57 * snapshot. They are just two device trees that happen to point to the
58 * same data blocks.
59 *
60 * When we get a write in we decide if it's to a shared data block using
61 * some timestamp magic. If it is, we have to break sharing.
62 *
63 * Let's say we write to a shared block in what was the origin. The
64 * steps are:
65 *
66 * i) plug io further to this physical block. (see bio_prison code).
67 *
68 * ii) quiesce any read io to that shared data block. Obviously
69 * including all devices that share this block. (see deferred_set code)
70 *
71 * iii) copy the data block to a newly allocated block. This step can be
72 * missed out if the io covers the block. (schedule_copy).
73 *
74 * iv) insert the new mapping into the origin's btree
75 * (process_prepared_mappings). This act of inserting breaks some
76 * sharing of btree nodes between the two devices. Breaking sharing only
77 * affects the btree of that specific device. Btrees for the other
78 * devices that share the block never change. The btree for the origin
79 * device as it was after the last commit is untouched, ie. we're using
80 * persistent data structures in the functional programming sense.
81 *
82 * v) unplug io to this physical block, including the io that triggered
83 * the breaking of sharing.
84 *
85 * Steps (ii) and (iii) occur in parallel.
86 *
87 * The metadata _doesn't_ need to be committed before the io continues. We
88 * get away with this because the io is always written to a _new_ block.
89 * If there's a crash, then:
90 *
91 * - The origin mapping will point to the old origin block (the shared
92 * one). This will contain the data as it was before the io that triggered
93 * the breaking of sharing came in.
94 *
95 * - The snap mapping still points to the old block. As it would after
96 * the commit.
97 *
98 * The downside of this scheme is the timestamp magic isn't perfect, and
99 * will continue to think that data block in the snapshot device is shared
100 * even after the write to the origin has broken sharing. I suspect data
101 * blocks will typically be shared by many different devices, so we're
102 * breaking sharing n + 1 times, rather than n, where n is the number of
103 * devices that reference this data block. At the moment I think the
104 * benefits far, far outweigh the disadvantages.
105 */
106
107/*----------------------------------------------------------------*/
108
109/*
110 * Sometimes we can't deal with a bio straight away. We put them in prison
111 * where they can't cause any mischief. Bios are put in a cell identified
112 * by a key, multiple bios can be in the same cell. When the cell is
113 * subsequently unlocked the bios become available.
114 */
115struct bio_prison;
116
/* Identifies a prison cell; keys_equal() compares all three fields. */
117struct cell_key {
118 int virtual;
119 dm_thin_id dev;
120 dm_block_t block; /* the only field hash_key() uses to pick a bucket */
121};
122
/* One prison cell: the bios currently detained under a single key. */
123struct cell {
124 struct hlist_node list; /* linkage in the prison's hash bucket */
125 struct bio_prison *prison; /* prison this cell was allocated from */
126 struct cell_key key;
127 unsigned count; /* incremented by bio_detain() for every bio added */
128 struct bio_list bios; /* the detained bios */
129};
130
/* Hash table of cells; the bucket array is allocated directly after this struct. */
131struct bio_prison {
132 spinlock_t lock; /* held while buckets are searched or modified */
133 mempool_t *cell_pool; /* source of struct cell allocations */
134
135 unsigned nr_buckets;
136 unsigned hash_mask; /* nr_buckets - 1 */
137 struct hlist_head *cells; /* the buckets */
138};
139
/*
 * Pick the hash table size for @nr_cells cells: a quarter of @nr_cells,
 * capped at 8192, rounded up to a power of two and never below 128.
 */
static uint32_t calc_nr_buckets(unsigned nr_cells)
{
	uint32_t buckets;
	unsigned wanted = nr_cells / 4;

	if (wanted > 8192u)
		wanted = 8192u;

	for (buckets = 128; buckets < wanted; buckets <<= 1)
		;

	return buckets;
}
152
153/*
154 * @nr_cells should be the number of cells you want in use _concurrently_.
155 * Don't confuse it with the number of distinct keys.
156 */
157static struct bio_prison *prison_create(unsigned nr_cells)
158{
159 unsigned i;
160 uint32_t nr_buckets = calc_nr_buckets(nr_cells);
161 size_t len = sizeof(struct bio_prison) +
162 (sizeof(struct hlist_head) * nr_buckets);
163 struct bio_prison *prison = kmalloc(len, GFP_KERNEL);
164
165 if (!prison)
166 return NULL;
167
168 spin_lock_init(&prison->lock);
169 prison->cell_pool = mempool_create_kmalloc_pool(nr_cells,
170 sizeof(struct cell));
171 if (!prison->cell_pool) {
172 kfree(prison);
173 return NULL;
174 }
175
176 prison->nr_buckets = nr_buckets;
177 prison->hash_mask = nr_buckets - 1;
178 prison->cells = (struct hlist_head *) (prison + 1);
179 for (i = 0; i < nr_buckets; i++)
180 INIT_HLIST_HEAD(prison->cells + i);
181
182 return prison;
183}
184
185static void prison_destroy(struct bio_prison *prison)
186{
187 mempool_destroy(prison->cell_pool);
188 kfree(prison);
189}
190
191static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key)
192{
193 const unsigned long BIG_PRIME = 4294967291UL;
194 uint64_t hash = key->block * BIG_PRIME;
195
196 return (uint32_t) (hash & prison->hash_mask);
197}
198
199static int keys_equal(struct cell_key *lhs, struct cell_key *rhs)
200{
201 return (lhs->virtual == rhs->virtual) &&
202 (lhs->dev == rhs->dev) &&
203 (lhs->block == rhs->block);
204}
205
206static struct cell *__search_bucket(struct hlist_head *bucket,
207 struct cell_key *key)
208{
209 struct cell *cell;
210 struct hlist_node *tmp;
211
212 hlist_for_each_entry(cell, tmp, bucket, list)
213 if (keys_equal(&cell->key, key))
214 return cell;
215
216 return NULL;
217}
218
219/*
220 * This may block if a new cell needs allocating. You must ensure that
221 * cells will be unlocked even if the calling thread is blocked.
222 *
223 * Returns the number of entries in the cell prior to the new addition
224 * or < 0 on failure.
225 */
226static int bio_detain(struct bio_prison *prison, struct cell_key *key,
227 struct bio *inmate, struct cell **ref)
228{
229 int r;
230 unsigned long flags;
231 uint32_t hash = hash_key(prison, key);
232 struct cell *uninitialized_var(cell), *cell2 = NULL;
233
234 BUG_ON(hash > prison->nr_buckets);
235
236 spin_lock_irqsave(&prison->lock, flags);
237 cell = __search_bucket(prison->cells + hash, key);
238
239 if (!cell) {
240 /*
241 * Allocate a new cell
242 */
243 spin_unlock_irqrestore(&prison->lock, flags);
244 cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
245 spin_lock_irqsave(&prison->lock, flags);
246
247 /*
248 * We've been unlocked, so we have to double check that
249 * nobody else has inserted this cell in the meantime.
250 */
251 cell = __search_bucket(prison->cells + hash, key);
252
253 if (!cell) {
254 cell = cell2;
255 cell2 = NULL;
256
257 cell->prison = prison;
258 memcpy(&cell->key, key, sizeof(cell->key));
259 cell->count = 0;
260 bio_list_init(&cell->bios);
261 hlist_add_head(&cell->list, prison->cells + hash);
262 }
263 }
264
265 r = cell->count++;
266 bio_list_add(&cell->bios, inmate);
267 spin_unlock_irqrestore(&prison->lock, flags);
268
269 if (cell2)
270 mempool_free(cell2, prison->cell_pool);
271
272 *ref = cell;
273
274 return r;
275}
276
277/*
278 * @inmates must have been initialised prior to this call
279 */
280static void __cell_release(struct cell *cell, struct bio_list *inmates)
281{
282 struct bio_prison *prison = cell->prison;
283
284 hlist_del(&cell->list);
285
286 if (inmates)
287 bio_list_merge(inmates, &cell->bios);
288
289 mempool_free(cell, prison->cell_pool);
290}
291
292static void cell_release(struct cell *cell, struct bio_list *bios)
293{
294 unsigned long flags;
295 struct bio_prison *prison = cell->prison;
296
297 spin_lock_irqsave(&prison->lock, flags);
298 __cell_release(cell, bios);
299 spin_unlock_irqrestore(&prison->lock, flags);
300}
301
302/*
303 * There are a couple of places where we put a bio into a cell briefly
304 * before taking it out again. In these situations we know that no other
305 * bio may be in the cell. This function releases the cell, and also does
306 * a sanity check.
307 */
308static void cell_release_singleton(struct cell *cell, struct bio *bio)
309{
310 struct bio_prison *prison = cell->prison;
311 struct bio_list bios;
312 struct bio *b;
313 unsigned long flags;
314
315 bio_list_init(&bios);
316
317 spin_lock_irqsave(&prison->lock, flags);
318 __cell_release(cell, &bios);
319 spin_unlock_irqrestore(&prison->lock, flags);
320
321 b = bio_list_pop(&bios);
322 BUG_ON(b != bio);
323 BUG_ON(!bio_list_empty(&bios));
324}
325
326static void cell_error(struct cell *cell)
327{
328 struct bio_prison *prison = cell->prison;
329 struct bio_list bios;
330 struct bio *bio;
331 unsigned long flags;
332
333 bio_list_init(&bios);
334
335 spin_lock_irqsave(&prison->lock, flags);
336 __cell_release(cell, &bios);
337 spin_unlock_irqrestore(&prison->lock, flags);
338
339 while ((bio = bio_list_pop(&bios)))
340 bio_io_error(bio);
341}
342
343/*----------------------------------------------------------------*/
344
345/*
346 * We use the deferred set to keep track of pending reads to shared blocks.
347 * We do this to ensure the new mapping caused by a write isn't performed
348 * until these prior reads have completed. Otherwise the insertion of the
349 * new mapping could free the old block that the read bios are mapped to.
350 */
351
352struct deferred_set;
353struct deferred_entry {
354 struct deferred_set *ds;
355 unsigned count;
356 struct list_head work_items;
357};
358
359struct deferred_set {
360 spinlock_t lock;
361 unsigned current_entry;
362 unsigned sweeper;
363 struct deferred_entry entries[DEFERRED_SET_SIZE];
364};
365
366static void ds_init(struct deferred_set *ds)
367{
368 int i;
369
370 spin_lock_init(&ds->lock);
371 ds->current_entry = 0;
372 ds->sweeper = 0;
373 for (i = 0; i < DEFERRED_SET_SIZE; i++) {
374 ds->entries[i].ds = ds;
375 ds->entries[i].count = 0;
376 INIT_LIST_HEAD(&ds->entries[i].work_items);
377 }
378}
379
380static struct deferred_entry *ds_inc(struct deferred_set *ds)
381{
382 unsigned long flags;
383 struct deferred_entry *entry;
384
385 spin_lock_irqsave(&ds->lock, flags);
386 entry = ds->entries + ds->current_entry;
387 entry->count++;
388 spin_unlock_irqrestore(&ds->lock, flags);
389
390 return entry;
391}
392
393static unsigned ds_next(unsigned index)
394{
395 return (index + 1) % DEFERRED_SET_SIZE;
396}
397
398static void __sweep(struct deferred_set *ds, struct list_head *head)
399{
400 while ((ds->sweeper != ds->current_entry) &&
401 !ds->entries[ds->sweeper].count) {
402 list_splice_init(&ds->entries[ds->sweeper].work_items, head);
403 ds->sweeper = ds_next(ds->sweeper);
404 }
405
406 if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
407 list_splice_init(&ds->entries[ds->sweeper].work_items, head);
408}
409
410static void ds_dec(struct deferred_entry *entry, struct list_head *head)
411{
412 unsigned long flags;
413
414 spin_lock_irqsave(&entry->ds->lock, flags);
415 BUG_ON(!entry->count);
416 --entry->count;
417 __sweep(entry->ds, head);
418 spin_unlock_irqrestore(&entry->ds->lock, flags);
419}
420
421/*
422 * Returns 1 if deferred or 0 if no pending items to delay job.
423 */
424static int ds_add_work(struct deferred_set *ds, struct list_head *work)
425{
426 int r = 1;
427 unsigned long flags;
428 unsigned next_entry;
429
430 spin_lock_irqsave(&ds->lock, flags);
431 if ((ds->sweeper == ds->current_entry) &&
432 !ds->entries[ds->current_entry].count)
433 r = 0;
434 else {
435 list_add(work, &ds->entries[ds->current_entry].work_items);
436 next_entry = ds_next(ds->current_entry);
437 if (!ds->entries[next_entry].count)
438 ds->current_entry = next_entry;
439 }
440 spin_unlock_irqrestore(&ds->lock, flags);
441
442 return r;
443}
444
445/*----------------------------------------------------------------*/
446
447/*
448 * Key building.
449 */
450static void build_data_key(struct dm_thin_device *td,
451 dm_block_t b, struct cell_key *key)
452{
453 key->virtual = 0;
454 key->dev = dm_thin_dev_id(td);
455 key->block = b;
456}
457
458static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
459 struct cell_key *key)
460{
461 key->virtual = 1;
462 key->dev = dm_thin_dev_id(td);
463 key->block = b;
464}
465
466/*----------------------------------------------------------------*/
467
468/*
469 * A pool device ties together a metadata device and a data device. It
470 * also provides the interface for creating and destroying internal
471 * devices.
472 */
473struct new_mapping;
474struct pool {
475 struct list_head list;
476 struct dm_target *ti; /* Only set if a pool target is bound */
477
478 struct mapped_device *pool_md;
479 struct block_device *md_dev;
480 struct dm_pool_metadata *pmd;
481
482 uint32_t sectors_per_block;
483 unsigned block_shift;
484 dm_block_t offset_mask;
485 dm_block_t low_water_blocks;
486
487 unsigned zero_new_blocks:1;
488 unsigned low_water_triggered:1; /* A dm event has been sent */
489 unsigned no_free_space:1; /* A -ENOSPC warning has been issued */
490
491 struct bio_prison *prison;
492 struct dm_kcopyd_client *copier;
493
494 struct workqueue_struct *wq;
495 struct work_struct worker;
496
497 unsigned ref_count;
498
499 spinlock_t lock;
500 struct bio_list deferred_bios;
501 struct bio_list deferred_flush_bios;
502 struct list_head prepared_mappings;
503
504 struct bio_list retry_on_resume_list;
505
506 struct deferred_set ds; /* FIXME: move to thin_c */
507
508 struct new_mapping *next_mapping;
509 mempool_t *mapping_pool;
510 mempool_t *endio_hook_pool;
511};
512
513/*
514 * Target context for a pool.
515 */
516struct pool_c {
517 struct dm_target *ti;
518 struct pool *pool;
519 struct dm_dev *data_dev;
520 struct dm_dev *metadata_dev;
521 struct dm_target_callbacks callbacks;
522
523 dm_block_t low_water_blocks;
524 unsigned zero_new_blocks:1;
525};
526
527/*
528 * Target context for a thin.
529 */
530struct thin_c {
531 struct dm_dev *pool_dev;
532 dm_thin_id dev_id;
533
534 struct pool *pool;
535 struct dm_thin_device *td;
536};
537
538/*----------------------------------------------------------------*/
539
540/*
541 * A global list of pools that uses a struct mapped_device as a key.
542 */
543static struct dm_thin_pool_table {
544 struct mutex mutex;
545 struct list_head pools;
546} dm_thin_pool_table;
547
548static void pool_table_init(void)
549{
550 mutex_init(&dm_thin_pool_table.mutex);
551 INIT_LIST_HEAD(&dm_thin_pool_table.pools);
552}
553
554static void __pool_table_insert(struct pool *pool)
555{
556 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
557 list_add(&pool->list, &dm_thin_pool_table.pools);
558}
559
560static void __pool_table_remove(struct pool *pool)
561{
562 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
563 list_del(&pool->list);
564}
565
566static struct pool *__pool_table_lookup(struct mapped_device *md)
567{
568 struct pool *pool = NULL, *tmp;
569
570 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
571
572 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
573 if (tmp->pool_md == md) {
574 pool = tmp;
575 break;
576 }
577 }
578
579 return pool;
580}
581
582static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
583{
584 struct pool *pool = NULL, *tmp;
585
586 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
587
588 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
589 if (tmp->md_dev == md_dev) {
590 pool = tmp;
591 break;
592 }
593 }
594
595 return pool;
596}
597
598/*----------------------------------------------------------------*/
599
600static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
601{
602 struct bio *bio;
603 struct bio_list bios;
604
605 bio_list_init(&bios);
606 bio_list_merge(&bios, master);
607 bio_list_init(master);
608
609 while ((bio = bio_list_pop(&bios))) {
610 if (dm_get_mapinfo(bio)->ptr == tc)
611 bio_endio(bio, DM_ENDIO_REQUEUE);
612 else
613 bio_list_add(master, bio);
614 }
615}
616
617static void requeue_io(struct thin_c *tc)
618{
619 struct pool *pool = tc->pool;
620 unsigned long flags;
621
622 spin_lock_irqsave(&pool->lock, flags);
623 __requeue_bio_list(tc, &pool->deferred_bios);
624 __requeue_bio_list(tc, &pool->retry_on_resume_list);
625 spin_unlock_irqrestore(&pool->lock, flags);
626}
627
628/*
629 * This section of code contains the logic for processing a thin device's IO.
630 * Much of the code depends on pool object resources (lists, workqueues, etc)
631 * but most is exclusively called from the thin target rather than the thin-pool
632 * target.
633 */
634
635static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
636{
637 return bio->bi_sector >> tc->pool->block_shift;
638}
639
640static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
641{
642 struct pool *pool = tc->pool;
643
644 bio->bi_bdev = tc->pool_dev->bdev;
645 bio->bi_sector = (block << pool->block_shift) +
646 (bio->bi_sector & pool->offset_mask);
647}
648
649static void remap_and_issue(struct thin_c *tc, struct bio *bio,
650 dm_block_t block)
651{
652 struct pool *pool = tc->pool;
653 unsigned long flags;
654
655 remap(tc, bio, block);
656
657 /*
658 * Batch together any FUA/FLUSH bios we find and then issue
659 * a single commit for them in process_deferred_bios().
660 */
661 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
662 spin_lock_irqsave(&pool->lock, flags);
663 bio_list_add(&pool->deferred_flush_bios, bio);
664 spin_unlock_irqrestore(&pool->lock, flags);
665 } else
666 generic_make_request(bio);
667}
668
669/*
670 * wake_worker() is used when new work is queued and when pool_resume is
671 * ready to continue deferred IO processing.
672 */
673static void wake_worker(struct pool *pool)
674{
675 queue_work(pool->wq, &pool->worker);
676}
677
678/*----------------------------------------------------------------*/
679
680/*
681 * Bio endio functions.
682 */
683struct endio_hook {
684 struct thin_c *tc;
685 bio_end_io_t *saved_bi_end_io;
686 struct deferred_entry *entry;
687};
688
689struct new_mapping {
690 struct list_head list;
691
692 int prepared;
693
694 struct thin_c *tc;
695 dm_block_t virt_block;
696 dm_block_t data_block;
697 struct cell *cell;
698 int err;
699
700 /*
701 * If the bio covers the whole area of a block then we can avoid
702 * zeroing or copying. Instead this bio is hooked. The bio will
703 * still be in the cell, so care has to be taken to avoid issuing
704 * the bio twice.
705 */
706 struct bio *bio;
707 bio_end_io_t *saved_bi_end_io;
708};
709
710static void __maybe_add_mapping(struct new_mapping *m)
711{
712 struct pool *pool = m->tc->pool;
713
714 if (list_empty(&m->list) && m->prepared) {
715 list_add(&m->list, &pool->prepared_mappings);
716 wake_worker(pool);
717 }
718}
719
720static void copy_complete(int read_err, unsigned long write_err, void *context)
721{
722 unsigned long flags;
723 struct new_mapping *m = context;
724 struct pool *pool = m->tc->pool;
725
726 m->err = read_err || write_err ? -EIO : 0;
727
728 spin_lock_irqsave(&pool->lock, flags);
729 m->prepared = 1;
730 __maybe_add_mapping(m);
731 spin_unlock_irqrestore(&pool->lock, flags);
732}
733
734static void overwrite_endio(struct bio *bio, int err)
735{
736 unsigned long flags;
737 struct new_mapping *m = dm_get_mapinfo(bio)->ptr;
738 struct pool *pool = m->tc->pool;
739
740 m->err = err;
741
742 spin_lock_irqsave(&pool->lock, flags);
743 m->prepared = 1;
744 __maybe_add_mapping(m);
745 spin_unlock_irqrestore(&pool->lock, flags);
746}
747
748static void shared_read_endio(struct bio *bio, int err)
749{
750 struct list_head mappings;
751 struct new_mapping *m, *tmp;
752 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
753 unsigned long flags;
754 struct pool *pool = h->tc->pool;
755
756 bio->bi_end_io = h->saved_bi_end_io;
757 bio_endio(bio, err);
758
759 INIT_LIST_HEAD(&mappings);
760 ds_dec(h->entry, &mappings);
761
762 spin_lock_irqsave(&pool->lock, flags);
763 list_for_each_entry_safe(m, tmp, &mappings, list) {
764 list_del(&m->list);
765 INIT_LIST_HEAD(&m->list);
766 __maybe_add_mapping(m);
767 }
768 spin_unlock_irqrestore(&pool->lock, flags);
769
770 mempool_free(h, pool->endio_hook_pool);
771}
772
773/*----------------------------------------------------------------*/
774
775/*
776 * Workqueue.
777 */
778
779/*
780 * Prepared mapping jobs.
781 */
782
783/*
784 * This sends the bios in the cell back to the deferred_bios list.
785 */
786static void cell_defer(struct thin_c *tc, struct cell *cell,
787 dm_block_t data_block)
788{
789 struct pool *pool = tc->pool;
790 unsigned long flags;
791
792 spin_lock_irqsave(&pool->lock, flags);
793 cell_release(cell, &pool->deferred_bios);
794 spin_unlock_irqrestore(&tc->pool->lock, flags);
795
796 wake_worker(pool);
797}
798
799/*
800 * Same as cell_defer above, except it omits one particular detainee,
801 * a write bio that covers the block and has already been processed.
802 */
803static void cell_defer_except(struct thin_c *tc, struct cell *cell,
804 struct bio *exception)
805{
806 struct bio_list bios;
807 struct bio *bio;
808 struct pool *pool = tc->pool;
809 unsigned long flags;
810
811 bio_list_init(&bios);
812 cell_release(cell, &bios);
813
814 spin_lock_irqsave(&pool->lock, flags);
815 while ((bio = bio_list_pop(&bios)))
816 if (bio != exception)
817 bio_list_add(&pool->deferred_bios, bio);
818 spin_unlock_irqrestore(&pool->lock, flags);
819
820 wake_worker(pool);
821}
822
823static void process_prepared_mapping(struct new_mapping *m)
824{
825 struct thin_c *tc = m->tc;
826 struct bio *bio;
827 int r;
828
829 bio = m->bio;
830 if (bio)
831 bio->bi_end_io = m->saved_bi_end_io;
832
833 if (m->err) {
834 cell_error(m->cell);
835 return;
836 }
837
838 /*
839 * Commit the prepared block into the mapping btree.
840 * Any I/O for this block arriving after this point will get
841 * remapped to it directly.
842 */
843 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
844 if (r) {
845 DMERR("dm_thin_insert_block() failed");
846 cell_error(m->cell);
847 return;
848 }
849
850 /*
851 * Release any bios held while the block was being provisioned.
852 * If we are processing a write bio that completely covers the block,
853 * we already processed it so can ignore it now when processing
854 * the bios in the cell.
855 */
856 if (bio) {
857 cell_defer_except(tc, m->cell, bio);
858 bio_endio(bio, 0);
859 } else
860 cell_defer(tc, m->cell, m->data_block);
861
862 list_del(&m->list);
863 mempool_free(m, tc->pool->mapping_pool);
864}
865
866static void process_prepared_mappings(struct pool *pool)
867{
868 unsigned long flags;
869 struct list_head maps;
870 struct new_mapping *m, *tmp;
871
872 INIT_LIST_HEAD(&maps);
873 spin_lock_irqsave(&pool->lock, flags);
874 list_splice_init(&pool->prepared_mappings, &maps);
875 spin_unlock_irqrestore(&pool->lock, flags);
876
877 list_for_each_entry_safe(m, tmp, &maps, list)
878 process_prepared_mapping(m);
879}
880
881/*
882 * Deferred bio jobs.
883 */
884static int io_overwrites_block(struct pool *pool, struct bio *bio)
885{
886 return ((bio_data_dir(bio) == WRITE) &&
887 !(bio->bi_sector & pool->offset_mask)) &&
888 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
889}
890
891static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
892 bio_end_io_t *fn)
893{
894 *save = bio->bi_end_io;
895 bio->bi_end_io = fn;
896}
897
898static int ensure_next_mapping(struct pool *pool)
899{
900 if (pool->next_mapping)
901 return 0;
902
903 pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
904
905 return pool->next_mapping ? 0 : -ENOMEM;
906}
907
908static struct new_mapping *get_next_mapping(struct pool *pool)
909{
910 struct new_mapping *r = pool->next_mapping;
911
912 BUG_ON(!pool->next_mapping);
913
914 pool->next_mapping = NULL;
915
916 return r;
917}
918
919static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
920 dm_block_t data_origin, dm_block_t data_dest,
921 struct cell *cell, struct bio *bio)
922{
923 int r;
924 struct pool *pool = tc->pool;
925 struct new_mapping *m = get_next_mapping(pool);
926
927 INIT_LIST_HEAD(&m->list);
928 m->prepared = 0;
929 m->tc = tc;
930 m->virt_block = virt_block;
931 m->data_block = data_dest;
932 m->cell = cell;
933 m->err = 0;
934 m->bio = NULL;
935
936 ds_add_work(&pool->ds, &m->list);
937
938 /*
939 * IO to pool_dev remaps to the pool target's data_dev.
940 *
941 * If the whole block of data is being overwritten, we can issue the
942 * bio immediately. Otherwise we use kcopyd to clone the data first.
943 */
944 if (io_overwrites_block(pool, bio)) {
945 m->bio = bio;
946 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
947 dm_get_mapinfo(bio)->ptr = m;
948 remap_and_issue(tc, bio, data_dest);
949 } else {
950 struct dm_io_region from, to;
951
952 from.bdev = tc->pool_dev->bdev;
953 from.sector = data_origin * pool->sectors_per_block;
954 from.count = pool->sectors_per_block;
955
956 to.bdev = tc->pool_dev->bdev;
957 to.sector = data_dest * pool->sectors_per_block;
958 to.count = pool->sectors_per_block;
959
960 r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
961 0, copy_complete, m);
962 if (r < 0) {
963 mempool_free(m, pool->mapping_pool);
964 DMERR("dm_kcopyd_copy() failed");
965 cell_error(cell);
966 }
967 }
968}
969
970static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
971 dm_block_t data_block, struct cell *cell,
972 struct bio *bio)
973{
974 struct pool *pool = tc->pool;
975 struct new_mapping *m = get_next_mapping(pool);
976
977 INIT_LIST_HEAD(&m->list);
978 m->prepared = 0;
979 m->tc = tc;
980 m->virt_block = virt_block;
981 m->data_block = data_block;
982 m->cell = cell;
983 m->err = 0;
984 m->bio = NULL;
985
986 /*
987 * If the whole block of data is being overwritten or we are not
988 * zeroing pre-existing data, we can issue the bio immediately.
989 * Otherwise we use kcopyd to zero the data first.
990 */
991 if (!pool->zero_new_blocks)
992 process_prepared_mapping(m);
993
994 else if (io_overwrites_block(pool, bio)) {
995 m->bio = bio;
996 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
997 dm_get_mapinfo(bio)->ptr = m;
998 remap_and_issue(tc, bio, data_block);
999
1000 } else {
1001 int r;
1002 struct dm_io_region to;
1003
1004 to.bdev = tc->pool_dev->bdev;
1005 to.sector = data_block * pool->sectors_per_block;
1006 to.count = pool->sectors_per_block;
1007
1008 r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
1009 if (r < 0) {
1010 mempool_free(m, pool->mapping_pool);
1011 DMERR("dm_kcopyd_zero() failed");
1012 cell_error(cell);
1013 }
1014 }
1015}
1016
1017static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
1018{
1019 int r;
1020 dm_block_t free_blocks;
1021 unsigned long flags;
1022 struct pool *pool = tc->pool;
1023
1024 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1025 if (r)
1026 return r;
1027
1028 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
1029 DMWARN("%s: reached low water mark, sending event.",
1030 dm_device_name(pool->pool_md));
1031 spin_lock_irqsave(&pool->lock, flags);
1032 pool->low_water_triggered = 1;
1033 spin_unlock_irqrestore(&pool->lock, flags);
1034 dm_table_event(pool->ti->table);
1035 }
1036
1037 if (!free_blocks) {
1038 if (pool->no_free_space)
1039 return -ENOSPC;
1040 else {
1041 /*
1042 * Try to commit to see if that will free up some
1043 * more space.
1044 */
1045 r = dm_pool_commit_metadata(pool->pmd);
1046 if (r) {
1047 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1048 __func__, r);
1049 return r;
1050 }
1051
1052 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1053 if (r)
1054 return r;
1055
1056 /*
1057 * If we still have no space we set a flag to avoid
1058 * doing all this checking and return -ENOSPC.
1059 */
1060 if (!free_blocks) {
1061 DMWARN("%s: no free space available.",
1062 dm_device_name(pool->pool_md));
1063 spin_lock_irqsave(&pool->lock, flags);
1064 pool->no_free_space = 1;
1065 spin_unlock_irqrestore(&pool->lock, flags);
1066 return -ENOSPC;
1067 }
1068 }
1069 }
1070
1071 r = dm_pool_alloc_data_block(pool->pmd, result);
1072 if (r)
1073 return r;
1074
1075 return 0;
1076}
1077
1078/*
1079 * If we have run out of space, queue bios until the device is
1080 * resumed, presumably after having been reloaded with more space.
1081 */
1082static void retry_on_resume(struct bio *bio)
1083{
1084 struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
1085 struct pool *pool = tc->pool;
1086 unsigned long flags;
1087
1088 spin_lock_irqsave(&pool->lock, flags);
1089 bio_list_add(&pool->retry_on_resume_list, bio);
1090 spin_unlock_irqrestore(&pool->lock, flags);
1091}
1092
1093static void no_space(struct cell *cell)
1094{
1095 struct bio *bio;
1096 struct bio_list bios;
1097
1098 bio_list_init(&bios);
1099 cell_release(cell, &bios);
1100
1101 while ((bio = bio_list_pop(&bios)))
1102 retry_on_resume(bio);
1103}
1104
1105static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1106 struct cell_key *key,
1107 struct dm_thin_lookup_result *lookup_result,
1108 struct cell *cell)
1109{
1110 int r;
1111 dm_block_t data_block;
1112
1113 r = alloc_data_block(tc, &data_block);
1114 switch (r) {
1115 case 0:
1116 schedule_copy(tc, block, lookup_result->block,
1117 data_block, cell, bio);
1118 break;
1119
1120 case -ENOSPC:
1121 no_space(cell);
1122 break;
1123
1124 default:
1125 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
1126 cell_error(cell);
1127 break;
1128 }
1129}
1130
1131static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1132 dm_block_t block,
1133 struct dm_thin_lookup_result *lookup_result)
1134{
1135 struct cell *cell;
1136 struct pool *pool = tc->pool;
1137 struct cell_key key;
1138
1139 /*
1140 * If cell is already occupied, then sharing is already in the process
1141 * of being broken so we have nothing further to do here.
1142 */
1143 build_data_key(tc->td, lookup_result->block, &key);
1144 if (bio_detain(pool->prison, &key, bio, &cell))
1145 return;
1146
1147 if (bio_data_dir(bio) == WRITE)
1148 break_sharing(tc, bio, block, &key, lookup_result, cell);
1149 else {
1150 struct endio_hook *h;
1151 h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
1152
1153 h->tc = tc;
1154 h->entry = ds_inc(&pool->ds);
1155 save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio);
1156 dm_get_mapinfo(bio)->ptr = h;
1157
1158 cell_release_singleton(cell, bio);
1159 remap_and_issue(tc, bio, lookup_result->block);
1160 }
1161}
1162
1163static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1164 struct cell *cell)
1165{
1166 int r;
1167 dm_block_t data_block;
1168
1169 /*
1170 * Remap empty bios (flushes) immediately, without provisioning.
1171 */
1172 if (!bio->bi_size) {
1173 cell_release_singleton(cell, bio);
1174 remap_and_issue(tc, bio, 0);
1175 return;
1176 }
1177
1178 /*
1179 * Fill read bios with zeroes and complete them immediately.
1180 */
1181 if (bio_data_dir(bio) == READ) {
1182 zero_fill_bio(bio);
1183 cell_release_singleton(cell, bio);
1184 bio_endio(bio, 0);
1185 return;
1186 }
1187
1188 r = alloc_data_block(tc, &data_block);
1189 switch (r) {
1190 case 0:
1191 schedule_zero(tc, block, data_block, cell, bio);
1192 break;
1193
1194 case -ENOSPC:
1195 no_space(cell);
1196 break;
1197
1198 default:
1199 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
1200 cell_error(cell);
1201 break;
1202 }
1203}
1204
1205static void process_bio(struct thin_c *tc, struct bio *bio)
1206{
1207 int r;
1208 dm_block_t block = get_bio_block(tc, bio);
1209 struct cell *cell;
1210 struct cell_key key;
1211 struct dm_thin_lookup_result lookup_result;
1212
1213 /*
1214 * If cell is already occupied, then the block is already
1215 * being provisioned so we have nothing further to do here.
1216 */
1217 build_virtual_key(tc->td, block, &key);
1218 if (bio_detain(tc->pool->prison, &key, bio, &cell))
1219 return;
1220
1221 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1222 switch (r) {
1223 case 0:
1224 /*
1225 * We can release this cell now. This thread is the only
1226 * one that puts bios into a cell, and we know there were
1227 * no preceding bios.
1228 */
1229 /*
1230 * TODO: this will probably have to change when discard goes
1231 * back in.
1232 */
1233 cell_release_singleton(cell, bio);
1234
1235 if (lookup_result.shared)
1236 process_shared_bio(tc, bio, block, &lookup_result);
1237 else
1238 remap_and_issue(tc, bio, lookup_result.block);
1239 break;
1240
1241 case -ENODATA:
1242 provision_block(tc, bio, block, cell);
1243 break;
1244
1245 default:
1246 DMERR("dm_thin_find_block() failed, error = %d", r);
1247 bio_io_error(bio);
1248 break;
1249 }
1250}
1251
1252static void process_deferred_bios(struct pool *pool)
1253{
1254 unsigned long flags;
1255 struct bio *bio;
1256 struct bio_list bios;
1257 int r;
1258
1259 bio_list_init(&bios);
1260
1261 spin_lock_irqsave(&pool->lock, flags);
1262 bio_list_merge(&bios, &pool->deferred_bios);
1263 bio_list_init(&pool->deferred_bios);
1264 spin_unlock_irqrestore(&pool->lock, flags);
1265
1266 while ((bio = bio_list_pop(&bios))) {
1267 struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
1268 /*
1269 * If we've got no free new_mapping structs, and processing
1270 * this bio might require one, we pause until there are some
1271 * prepared mappings to process.
1272 */
1273 if (ensure_next_mapping(pool)) {
1274 spin_lock_irqsave(&pool->lock, flags);
1275 bio_list_merge(&pool->deferred_bios, &bios);
1276 spin_unlock_irqrestore(&pool->lock, flags);
1277
1278 break;
1279 }
1280 process_bio(tc, bio);
1281 }
1282
1283 /*
1284 * If there are any deferred flush bios, we must commit
1285 * the metadata before issuing them.
1286 */
1287 bio_list_init(&bios);
1288 spin_lock_irqsave(&pool->lock, flags);
1289 bio_list_merge(&bios, &pool->deferred_flush_bios);
1290 bio_list_init(&pool->deferred_flush_bios);
1291 spin_unlock_irqrestore(&pool->lock, flags);
1292
1293 if (bio_list_empty(&bios))
1294 return;
1295
1296 r = dm_pool_commit_metadata(pool->pmd);
1297 if (r) {
1298 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1299 __func__, r);
1300 while ((bio = bio_list_pop(&bios)))
1301 bio_io_error(bio);
1302 return;
1303 }
1304
1305 while ((bio = bio_list_pop(&bios)))
1306 generic_make_request(bio);
1307}
1308
1309static void do_worker(struct work_struct *ws)
1310{
1311 struct pool *pool = container_of(ws, struct pool, worker);
1312
1313 process_prepared_mappings(pool);
1314 process_deferred_bios(pool);
1315}
1316
1317/*----------------------------------------------------------------*/
1318
1319/*
1320 * Mapping functions.
1321 */
1322
1323/*
1324 * Called only while mapping a thin bio to hand it over to the workqueue.
1325 */
1326static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1327{
1328 unsigned long flags;
1329 struct pool *pool = tc->pool;
1330
1331 spin_lock_irqsave(&pool->lock, flags);
1332 bio_list_add(&pool->deferred_bios, bio);
1333 spin_unlock_irqrestore(&pool->lock, flags);
1334
1335 wake_worker(pool);
1336}
1337
1338/*
1339 * Non-blocking function called from the thin target's map function.
1340 */
1341static int thin_bio_map(struct dm_target *ti, struct bio *bio,
1342 union map_info *map_context)
1343{
1344 int r;
1345 struct thin_c *tc = ti->private;
1346 dm_block_t block = get_bio_block(tc, bio);
1347 struct dm_thin_device *td = tc->td;
1348 struct dm_thin_lookup_result result;
1349
1350 /*
1351 * Save the thin context for easy access from the deferred bio later.
1352 */
1353 map_context->ptr = tc;
1354
1355 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
1356 thin_defer_bio(tc, bio);
1357 return DM_MAPIO_SUBMITTED;
1358 }
1359
1360 r = dm_thin_find_block(td, block, 0, &result);
1361
1362 /*
1363 * Note that we defer readahead too.
1364 */
1365 switch (r) {
1366 case 0:
1367 if (unlikely(result.shared)) {
1368 /*
1369 * We have a race condition here between the
1370 * result.shared value returned by the lookup and
1371 * snapshot creation, which may cause new
1372 * sharing.
1373 *
1374 * To avoid this always quiesce the origin before
1375 * taking the snap. You want to do this anyway to
1376 * ensure a consistent application view
1377 * (i.e. lockfs).
1378 *
1379 * More distant ancestors are irrelevant. The
1380 * shared flag will be set in their case.
1381 */
1382 thin_defer_bio(tc, bio);
1383 r = DM_MAPIO_SUBMITTED;
1384 } else {
1385 remap(tc, bio, result.block);
1386 r = DM_MAPIO_REMAPPED;
1387 }
1388 break;
1389
1390 case -ENODATA:
1391 /*
1392 * In future, the failed dm_thin_find_block above could
1393 * provide the hint to load the metadata into cache.
1394 */
1395 case -EWOULDBLOCK:
1396 thin_defer_bio(tc, bio);
1397 r = DM_MAPIO_SUBMITTED;
1398 break;
1399 }
1400
1401 return r;
1402}
1403
1404static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1405{
1406 int r;
1407 unsigned long flags;
1408 struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
1409
1410 spin_lock_irqsave(&pt->pool->lock, flags);
1411 r = !bio_list_empty(&pt->pool->retry_on_resume_list);
1412 spin_unlock_irqrestore(&pt->pool->lock, flags);
1413
1414 if (!r) {
1415 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1416 r = bdi_congested(&q->backing_dev_info, bdi_bits);
1417 }
1418
1419 return r;
1420}
1421
1422static void __requeue_bios(struct pool *pool)
1423{
1424 bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
1425 bio_list_init(&pool->retry_on_resume_list);
1426}
1427
1428/*----------------------------------------------------------------
1429 * Binding of control targets to a pool object
1430 *--------------------------------------------------------------*/
1431static int bind_control_target(struct pool *pool, struct dm_target *ti)
1432{
1433 struct pool_c *pt = ti->private;
1434
1435 pool->ti = ti;
1436 pool->low_water_blocks = pt->low_water_blocks;
1437 pool->zero_new_blocks = pt->zero_new_blocks;
1438
1439 return 0;
1440}
1441
1442static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1443{
1444 if (pool->ti == ti)
1445 pool->ti = NULL;
1446}
1447
1448/*----------------------------------------------------------------
1449 * Pool creation
1450 *--------------------------------------------------------------*/
1451static void __pool_destroy(struct pool *pool)
1452{
1453 __pool_table_remove(pool);
1454
1455 if (dm_pool_metadata_close(pool->pmd) < 0)
1456 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1457
1458 prison_destroy(pool->prison);
1459 dm_kcopyd_client_destroy(pool->copier);
1460
1461 if (pool->wq)
1462 destroy_workqueue(pool->wq);
1463
1464 if (pool->next_mapping)
1465 mempool_free(pool->next_mapping, pool->mapping_pool);
1466 mempool_destroy(pool->mapping_pool);
1467 mempool_destroy(pool->endio_hook_pool);
1468 kfree(pool);
1469}
1470
1471static struct pool *pool_create(struct mapped_device *pool_md,
1472 struct block_device *metadata_dev,
1473 unsigned long block_size, char **error)
1474{
1475 int r;
1476 void *err_p;
1477 struct pool *pool;
1478 struct dm_pool_metadata *pmd;
1479
1480 pmd = dm_pool_metadata_open(metadata_dev, block_size);
1481 if (IS_ERR(pmd)) {
1482 *error = "Error creating metadata object";
1483 return (struct pool *)pmd;
1484 }
1485
1486 pool = kmalloc(sizeof(*pool), GFP_KERNEL);
1487 if (!pool) {
1488 *error = "Error allocating memory for pool";
1489 err_p = ERR_PTR(-ENOMEM);
1490 goto bad_pool;
1491 }
1492
1493 pool->pmd = pmd;
1494 pool->sectors_per_block = block_size;
1495 pool->block_shift = ffs(block_size) - 1;
1496 pool->offset_mask = block_size - 1;
1497 pool->low_water_blocks = 0;
1498 pool->zero_new_blocks = 1;
1499 pool->prison = prison_create(PRISON_CELLS);
1500 if (!pool->prison) {
1501 *error = "Error creating pool's bio prison";
1502 err_p = ERR_PTR(-ENOMEM);
1503 goto bad_prison;
1504 }
1505
1506 pool->copier = dm_kcopyd_client_create();
1507 if (IS_ERR(pool->copier)) {
1508 r = PTR_ERR(pool->copier);
1509 *error = "Error creating pool's kcopyd client";
1510 err_p = ERR_PTR(r);
1511 goto bad_kcopyd_client;
1512 }
1513
1514 /*
1515 * Create singlethreaded workqueue that will service all devices
1516 * that use this metadata.
1517 */
1518 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1519 if (!pool->wq) {
1520 *error = "Error creating pool's workqueue";
1521 err_p = ERR_PTR(-ENOMEM);
1522 goto bad_wq;
1523 }
1524
1525 INIT_WORK(&pool->worker, do_worker);
1526 spin_lock_init(&pool->lock);
1527 bio_list_init(&pool->deferred_bios);
1528 bio_list_init(&pool->deferred_flush_bios);
1529 INIT_LIST_HEAD(&pool->prepared_mappings);
1530 pool->low_water_triggered = 0;
1531 pool->no_free_space = 0;
1532 bio_list_init(&pool->retry_on_resume_list);
1533 ds_init(&pool->ds);
1534
1535 pool->next_mapping = NULL;
1536 pool->mapping_pool =
1537 mempool_create_kmalloc_pool(MAPPING_POOL_SIZE, sizeof(struct new_mapping));
1538 if (!pool->mapping_pool) {
1539 *error = "Error creating pool's mapping mempool";
1540 err_p = ERR_PTR(-ENOMEM);
1541 goto bad_mapping_pool;
1542 }
1543
1544 pool->endio_hook_pool =
1545 mempool_create_kmalloc_pool(ENDIO_HOOK_POOL_SIZE, sizeof(struct endio_hook));
1546 if (!pool->endio_hook_pool) {
1547 *error = "Error creating pool's endio_hook mempool";
1548 err_p = ERR_PTR(-ENOMEM);
1549 goto bad_endio_hook_pool;
1550 }
1551 pool->ref_count = 1;
1552 pool->pool_md = pool_md;
1553 pool->md_dev = metadata_dev;
1554 __pool_table_insert(pool);
1555
1556 return pool;
1557
1558bad_endio_hook_pool:
1559 mempool_destroy(pool->mapping_pool);
1560bad_mapping_pool:
1561 destroy_workqueue(pool->wq);
1562bad_wq:
1563 dm_kcopyd_client_destroy(pool->copier);
1564bad_kcopyd_client:
1565 prison_destroy(pool->prison);
1566bad_prison:
1567 kfree(pool);
1568bad_pool:
1569 if (dm_pool_metadata_close(pmd))
1570 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1571
1572 return err_p;
1573}
1574
1575static void __pool_inc(struct pool *pool)
1576{
1577 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1578 pool->ref_count++;
1579}
1580
1581static void __pool_dec(struct pool *pool)
1582{
1583 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1584 BUG_ON(!pool->ref_count);
1585 if (!--pool->ref_count)
1586 __pool_destroy(pool);
1587}
1588
1589static struct pool *__pool_find(struct mapped_device *pool_md,
1590 struct block_device *metadata_dev,
1591 unsigned long block_size, char **error)
1592{
1593 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
1594
1595 if (pool) {
1596 if (pool->pool_md != pool_md)
1597 return ERR_PTR(-EBUSY);
1598 __pool_inc(pool);
1599
1600 } else {
1601 pool = __pool_table_lookup(pool_md);
1602 if (pool) {
1603 if (pool->md_dev != metadata_dev)
1604 return ERR_PTR(-EINVAL);
1605 __pool_inc(pool);
1606
1607 } else
1608 pool = pool_create(pool_md, metadata_dev, block_size, error);
1609 }
1610
1611 return pool;
1612}
1613
1614/*----------------------------------------------------------------
1615 * Pool target methods
1616 *--------------------------------------------------------------*/
1617static void pool_dtr(struct dm_target *ti)
1618{
1619 struct pool_c *pt = ti->private;
1620
1621 mutex_lock(&dm_thin_pool_table.mutex);
1622
1623 unbind_control_target(pt->pool, ti);
1624 __pool_dec(pt->pool);
1625 dm_put_device(ti, pt->metadata_dev);
1626 dm_put_device(ti, pt->data_dev);
1627 kfree(pt);
1628
1629 mutex_unlock(&dm_thin_pool_table.mutex);
1630}
1631
/*
 * Optional pool features parsed from the table line.
 */
struct pool_features {
	/* Cleared by the "skip_block_zeroing" feature argument. */
	unsigned int zero_new_blocks : 1;
};
1635
1636static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1637 struct dm_target *ti)
1638{
1639 int r;
1640 unsigned argc;
1641 const char *arg_name;
1642
1643 static struct dm_arg _args[] = {
1644 {0, 1, "Invalid number of pool feature arguments"},
1645 };
1646
1647 /*
1648 * No feature arguments supplied.
1649 */
1650 if (!as->argc)
1651 return 0;
1652
1653 r = dm_read_arg_group(_args, as, &argc, &ti->error);
1654 if (r)
1655 return -EINVAL;
1656
1657 while (argc && !r) {
1658 arg_name = dm_shift_arg(as);
1659 argc--;
1660
1661 if (!strcasecmp(arg_name, "skip_block_zeroing")) {
1662 pf->zero_new_blocks = 0;
1663 continue;
1664 }
1665
1666 ti->error = "Unrecognised pool feature requested";
1667 r = -EINVAL;
1668 }
1669
1670 return r;
1671}
1672
1673/*
1674 * thin-pool <metadata dev> <data dev>
1675 * <data block size (sectors)>
1676 * <low water mark (blocks)>
1677 * [<#feature args> [<arg>]*]
1678 *
1679 * Optional feature arguments are:
1680 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
1681 */
1682static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1683{
1684 int r;
1685 struct pool_c *pt;
1686 struct pool *pool;
1687 struct pool_features pf;
1688 struct dm_arg_set as;
1689 struct dm_dev *data_dev;
1690 unsigned long block_size;
1691 dm_block_t low_water_blocks;
1692 struct dm_dev *metadata_dev;
1693 sector_t metadata_dev_size;
1694
1695 /*
1696 * FIXME Remove validation from scope of lock.
1697 */
1698 mutex_lock(&dm_thin_pool_table.mutex);
1699
1700 if (argc < 4) {
1701 ti->error = "Invalid argument count";
1702 r = -EINVAL;
1703 goto out_unlock;
1704 }
1705 as.argc = argc;
1706 as.argv = argv;
1707
1708 r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
1709 if (r) {
1710 ti->error = "Error opening metadata block device";
1711 goto out_unlock;
1712 }
1713
1714 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
1715 if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) {
1716 ti->error = "Metadata device is too large";
1717 r = -EINVAL;
1718 goto out_metadata;
1719 }
1720
1721 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
1722 if (r) {
1723 ti->error = "Error getting data device";
1724 goto out_metadata;
1725 }
1726
1727 if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
1728 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1729 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
1730 !is_power_of_2(block_size)) {
1731 ti->error = "Invalid block size";
1732 r = -EINVAL;
1733 goto out;
1734 }
1735
1736 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
1737 ti->error = "Invalid low water mark";
1738 r = -EINVAL;
1739 goto out;
1740 }
1741
1742 /*
1743 * Set default pool features.
1744 */
1745 memset(&pf, 0, sizeof(pf));
1746 pf.zero_new_blocks = 1;
1747
1748 dm_consume_args(&as, 4);
1749 r = parse_pool_features(&as, &pf, ti);
1750 if (r)
1751 goto out;
1752
1753 pt = kzalloc(sizeof(*pt), GFP_KERNEL);
1754 if (!pt) {
1755 r = -ENOMEM;
1756 goto out;
1757 }
1758
1759 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
1760 block_size, &ti->error);
1761 if (IS_ERR(pool)) {
1762 r = PTR_ERR(pool);
1763 goto out_free_pt;
1764 }
1765
1766 pt->pool = pool;
1767 pt->ti = ti;
1768 pt->metadata_dev = metadata_dev;
1769 pt->data_dev = data_dev;
1770 pt->low_water_blocks = low_water_blocks;
1771 pt->zero_new_blocks = pf.zero_new_blocks;
1772 ti->num_flush_requests = 1;
1773 ti->num_discard_requests = 0;
1774 ti->private = pt;
1775
1776 pt->callbacks.congested_fn = pool_is_congested;
1777 dm_table_add_target_callbacks(ti->table, &pt->callbacks);
1778
1779 mutex_unlock(&dm_thin_pool_table.mutex);
1780
1781 return 0;
1782
1783out_free_pt:
1784 kfree(pt);
1785out:
1786 dm_put_device(ti, data_dev);
1787out_metadata:
1788 dm_put_device(ti, metadata_dev);
1789out_unlock:
1790 mutex_unlock(&dm_thin_pool_table.mutex);
1791
1792 return r;
1793}
1794
1795static int pool_map(struct dm_target *ti, struct bio *bio,
1796 union map_info *map_context)
1797{
1798 int r;
1799 struct pool_c *pt = ti->private;
1800 struct pool *pool = pt->pool;
1801 unsigned long flags;
1802
1803 /*
1804 * As this is a singleton target, ti->begin is always zero.
1805 */
1806 spin_lock_irqsave(&pool->lock, flags);
1807 bio->bi_bdev = pt->data_dev->bdev;
1808 r = DM_MAPIO_REMAPPED;
1809 spin_unlock_irqrestore(&pool->lock, flags);
1810
1811 return r;
1812}
1813
1814/*
1815 * Retrieves the number of blocks of the data device from
1816 * the superblock and compares it to the actual device size,
1817 * thus resizing the data device in case it has grown.
1818 *
1819 * This both copes with opening preallocated data devices in the ctr
1820 * being followed by a resume
1821 * -and-
1822 * calling the resume method individually after userspace has
1823 * grown the data device in reaction to a table event.
1824 */
1825static int pool_preresume(struct dm_target *ti)
1826{
1827 int r;
1828 struct pool_c *pt = ti->private;
1829 struct pool *pool = pt->pool;
1830 dm_block_t data_size, sb_data_size;
1831
1832 /*
1833 * Take control of the pool object.
1834 */
1835 r = bind_control_target(pool, ti);
1836 if (r)
1837 return r;
1838
1839 data_size = ti->len >> pool->block_shift;
1840 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
1841 if (r) {
1842 DMERR("failed to retrieve data device size");
1843 return r;
1844 }
1845
1846 if (data_size < sb_data_size) {
1847 DMERR("pool target too small, is %llu blocks (expected %llu)",
1848 data_size, sb_data_size);
1849 return -EINVAL;
1850
1851 } else if (data_size > sb_data_size) {
1852 r = dm_pool_resize_data_dev(pool->pmd, data_size);
1853 if (r) {
1854 DMERR("failed to resize data device");
1855 return r;
1856 }
1857
1858 r = dm_pool_commit_metadata(pool->pmd);
1859 if (r) {
1860 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1861 __func__, r);
1862 return r;
1863 }
1864 }
1865
1866 return 0;
1867}
1868
1869static void pool_resume(struct dm_target *ti)
1870{
1871 struct pool_c *pt = ti->private;
1872 struct pool *pool = pt->pool;
1873 unsigned long flags;
1874
1875 spin_lock_irqsave(&pool->lock, flags);
1876 pool->low_water_triggered = 0;
1877 pool->no_free_space = 0;
1878 __requeue_bios(pool);
1879 spin_unlock_irqrestore(&pool->lock, flags);
1880
1881 wake_worker(pool);
1882}
1883
1884static void pool_postsuspend(struct dm_target *ti)
1885{
1886 int r;
1887 struct pool_c *pt = ti->private;
1888 struct pool *pool = pt->pool;
1889
1890 flush_workqueue(pool->wq);
1891
1892 r = dm_pool_commit_metadata(pool->pmd);
1893 if (r < 0) {
1894 DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
1895 __func__, r);
1896 /* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/
1897 }
1898}
1899
1900static int check_arg_count(unsigned argc, unsigned args_required)
1901{
1902 if (argc != args_required) {
1903 DMWARN("Message received with %u arguments instead of %u.",
1904 argc, args_required);
1905 return -EINVAL;
1906 }
1907
1908 return 0;
1909}
1910
1911static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
1912{
1913 if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
1914 *dev_id <= MAX_DEV_ID)
1915 return 0;
1916
1917 if (warning)
1918 DMWARN("Message received with invalid device id: %s", arg);
1919
1920 return -EINVAL;
1921}
1922
1923static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
1924{
1925 dm_thin_id dev_id;
1926 int r;
1927
1928 r = check_arg_count(argc, 2);
1929 if (r)
1930 return r;
1931
1932 r = read_dev_id(argv[1], &dev_id, 1);
1933 if (r)
1934 return r;
1935
1936 r = dm_pool_create_thin(pool->pmd, dev_id);
1937 if (r) {
1938 DMWARN("Creation of new thinly-provisioned device with id %s failed.",
1939 argv[1]);
1940 return r;
1941 }
1942
1943 return 0;
1944}
1945
1946static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
1947{
1948 dm_thin_id dev_id;
1949 dm_thin_id origin_dev_id;
1950 int r;
1951
1952 r = check_arg_count(argc, 3);
1953 if (r)
1954 return r;
1955
1956 r = read_dev_id(argv[1], &dev_id, 1);
1957 if (r)
1958 return r;
1959
1960 r = read_dev_id(argv[2], &origin_dev_id, 1);
1961 if (r)
1962 return r;
1963
1964 r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
1965 if (r) {
1966 DMWARN("Creation of new snapshot %s of device %s failed.",
1967 argv[1], argv[2]);
1968 return r;
1969 }
1970
1971 return 0;
1972}
1973
1974static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
1975{
1976 dm_thin_id dev_id;
1977 int r;
1978
1979 r = check_arg_count(argc, 2);
1980 if (r)
1981 return r;
1982
1983 r = read_dev_id(argv[1], &dev_id, 1);
1984 if (r)
1985 return r;
1986
1987 r = dm_pool_delete_thin_device(pool->pmd, dev_id);
1988 if (r)
1989 DMWARN("Deletion of thin device %s failed.", argv[1]);
1990
1991 return r;
1992}
1993
1994static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
1995{
1996 dm_thin_id old_id, new_id;
1997 int r;
1998
1999 r = check_arg_count(argc, 3);
2000 if (r)
2001 return r;
2002
2003 if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
2004 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
2005 return -EINVAL;
2006 }
2007
2008 if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
2009 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
2010 return -EINVAL;
2011 }
2012
2013 r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
2014 if (r) {
2015 DMWARN("Failed to change transaction id from %s to %s.",
2016 argv[1], argv[2]);
2017 return r;
2018 }
2019
2020 return 0;
2021}
2022
2023/*
2024 * Messages supported:
2025 * create_thin <dev_id>
2026 * create_snap <dev_id> <origin_id>
2027 * delete <dev_id>
2028 * trim <dev_id> <new_size_in_sectors>
2029 * set_transaction_id <current_trans_id> <new_trans_id>
2030 */
2031static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2032{
2033 int r = -EINVAL;
2034 struct pool_c *pt = ti->private;
2035 struct pool *pool = pt->pool;
2036
2037 if (!strcasecmp(argv[0], "create_thin"))
2038 r = process_create_thin_mesg(argc, argv, pool);
2039
2040 else if (!strcasecmp(argv[0], "create_snap"))
2041 r = process_create_snap_mesg(argc, argv, pool);
2042
2043 else if (!strcasecmp(argv[0], "delete"))
2044 r = process_delete_mesg(argc, argv, pool);
2045
2046 else if (!strcasecmp(argv[0], "set_transaction_id"))
2047 r = process_set_transaction_id_mesg(argc, argv, pool);
2048
2049 else
2050 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2051
2052 if (!r) {
2053 r = dm_pool_commit_metadata(pool->pmd);
2054 if (r)
2055 DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
2056 argv[0], r);
2057 }
2058
2059 return r;
2060}
2061
2062/*
2063 * Status line is:
2064 * <transaction id> <used metadata sectors>/<total metadata sectors>
2065 * <used data sectors>/<total data sectors> <held metadata root>
2066 */
2067static int pool_status(struct dm_target *ti, status_type_t type,
2068 char *result, unsigned maxlen)
2069{
2070 int r;
2071 unsigned sz = 0;
2072 uint64_t transaction_id;
2073 dm_block_t nr_free_blocks_data;
2074 dm_block_t nr_free_blocks_metadata;
2075 dm_block_t nr_blocks_data;
2076 dm_block_t nr_blocks_metadata;
2077 dm_block_t held_root;
2078 char buf[BDEVNAME_SIZE];
2079 char buf2[BDEVNAME_SIZE];
2080 struct pool_c *pt = ti->private;
2081 struct pool *pool = pt->pool;
2082
2083 switch (type) {
2084 case STATUSTYPE_INFO:
2085 r = dm_pool_get_metadata_transaction_id(pool->pmd,
2086 &transaction_id);
2087 if (r)
2088 return r;
2089
2090 r = dm_pool_get_free_metadata_block_count(pool->pmd,
2091 &nr_free_blocks_metadata);
2092 if (r)
2093 return r;
2094
2095 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
2096 if (r)
2097 return r;
2098
2099 r = dm_pool_get_free_block_count(pool->pmd,
2100 &nr_free_blocks_data);
2101 if (r)
2102 return r;
2103
2104 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
2105 if (r)
2106 return r;
2107
2108 r = dm_pool_get_held_metadata_root(pool->pmd, &held_root);
2109 if (r)
2110 return r;
2111
2112 DMEMIT("%llu %llu/%llu %llu/%llu ",
2113 (unsigned long long)transaction_id,
2114 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2115 (unsigned long long)nr_blocks_metadata,
2116 (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
2117 (unsigned long long)nr_blocks_data);
2118
2119 if (held_root)
2120 DMEMIT("%llu", held_root);
2121 else
2122 DMEMIT("-");
2123
2124 break;
2125
2126 case STATUSTYPE_TABLE:
2127 DMEMIT("%s %s %lu %llu ",
2128 format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
2129 format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
2130 (unsigned long)pool->sectors_per_block,
2131 (unsigned long long)pt->low_water_blocks);
2132
2133 DMEMIT("%u ", !pool->zero_new_blocks);
2134
2135 if (!pool->zero_new_blocks)
2136 DMEMIT("skip_block_zeroing ");
2137 break;
2138 }
2139
2140 return 0;
2141}
2142
2143static int pool_iterate_devices(struct dm_target *ti,
2144 iterate_devices_callout_fn fn, void *data)
2145{
2146 struct pool_c *pt = ti->private;
2147
2148 return fn(ti, pt->data_dev, 0, ti->len, data);
2149}
2150
2151static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2152 struct bio_vec *biovec, int max_size)
2153{
2154 struct pool_c *pt = ti->private;
2155 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
2156
2157 if (!q->merge_bvec_fn)
2158 return max_size;
2159
2160 bvm->bi_bdev = pt->data_dev->bdev;
2161
2162 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2163}
2164
2165static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2166{
2167 struct pool_c *pt = ti->private;
2168 struct pool *pool = pt->pool;
2169
2170 blk_limits_io_min(limits, 0);
2171 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2172}
2173
2174static struct target_type pool_target = {
2175 .name = "thin-pool",
2176 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2177 DM_TARGET_IMMUTABLE,
2178 .version = {1, 0, 0},
2179 .module = THIS_MODULE,
2180 .ctr = pool_ctr,
2181 .dtr = pool_dtr,
2182 .map = pool_map,
2183 .postsuspend = pool_postsuspend,
2184 .preresume = pool_preresume,
2185 .resume = pool_resume,
2186 .message = pool_message,
2187 .status = pool_status,
2188 .merge = pool_merge,
2189 .iterate_devices = pool_iterate_devices,
2190 .io_hints = pool_io_hints,
2191};
2192
2193/*----------------------------------------------------------------
2194 * Thin target methods
2195 *--------------------------------------------------------------*/
2196static void thin_dtr(struct dm_target *ti)
2197{
2198 struct thin_c *tc = ti->private;
2199
2200 mutex_lock(&dm_thin_pool_table.mutex);
2201
2202 __pool_dec(tc->pool);
2203 dm_pool_close_thin_device(tc->td);
2204 dm_put_device(ti, tc->pool_dev);
2205 kfree(tc);
2206
2207 mutex_unlock(&dm_thin_pool_table.mutex);
2208}
2209
2210/*
2211 * Thin target parameters:
2212 *
2213 * <pool_dev> <dev_id>
2214 *
2215 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
2216 * dev_id: the internal device identifier
2217 */
2218static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2219{
2220 int r;
2221 struct thin_c *tc;
2222 struct dm_dev *pool_dev;
2223 struct mapped_device *pool_md;
2224
2225 mutex_lock(&dm_thin_pool_table.mutex);
2226
2227 if (argc != 2) {
2228 ti->error = "Invalid argument count";
2229 r = -EINVAL;
2230 goto out_unlock;
2231 }
2232
2233 tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
2234 if (!tc) {
2235 ti->error = "Out of memory";
2236 r = -ENOMEM;
2237 goto out_unlock;
2238 }
2239
2240 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
2241 if (r) {
2242 ti->error = "Error opening pool device";
2243 goto bad_pool_dev;
2244 }
2245 tc->pool_dev = pool_dev;
2246
2247 if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
2248 ti->error = "Invalid device id";
2249 r = -EINVAL;
2250 goto bad_common;
2251 }
2252
2253 pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
2254 if (!pool_md) {
2255 ti->error = "Couldn't get pool mapped device";
2256 r = -EINVAL;
2257 goto bad_common;
2258 }
2259
2260 tc->pool = __pool_table_lookup(pool_md);
2261 if (!tc->pool) {
2262 ti->error = "Couldn't find pool object";
2263 r = -EINVAL;
2264 goto bad_pool_lookup;
2265 }
2266 __pool_inc(tc->pool);
2267
2268 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
2269 if (r) {
2270 ti->error = "Couldn't open thin internal device";
2271 goto bad_thin_open;
2272 }
2273
2274 ti->split_io = tc->pool->sectors_per_block;
2275 ti->num_flush_requests = 1;
2276 ti->num_discard_requests = 0;
2277 ti->discards_supported = 0;
2278
2279 dm_put(pool_md);
2280
2281 mutex_unlock(&dm_thin_pool_table.mutex);
2282
2283 return 0;
2284
2285bad_thin_open:
2286 __pool_dec(tc->pool);
2287bad_pool_lookup:
2288 dm_put(pool_md);
2289bad_common:
2290 dm_put_device(ti, tc->pool_dev);
2291bad_pool_dev:
2292 kfree(tc);
2293out_unlock:
2294 mutex_unlock(&dm_thin_pool_table.mutex);
2295
2296 return r;
2297}
2298
2299static int thin_map(struct dm_target *ti, struct bio *bio,
2300 union map_info *map_context)
2301{
2302 bio->bi_sector -= ti->begin;
2303
2304 return thin_bio_map(ti, bio, map_context);
2305}
2306
2307static void thin_postsuspend(struct dm_target *ti)
2308{
2309 if (dm_noflush_suspending(ti))
2310 requeue_io((struct thin_c *)ti->private);
2311}
2312
2313/*
2314 * <nr mapped sectors> <highest mapped sector>
2315 */
2316static int thin_status(struct dm_target *ti, status_type_t type,
2317 char *result, unsigned maxlen)
2318{
2319 int r;
2320 ssize_t sz = 0;
2321 dm_block_t mapped, highest;
2322 char buf[BDEVNAME_SIZE];
2323 struct thin_c *tc = ti->private;
2324
2325 if (!tc->td)
2326 DMEMIT("-");
2327 else {
2328 switch (type) {
2329 case STATUSTYPE_INFO:
2330 r = dm_thin_get_mapped_count(tc->td, &mapped);
2331 if (r)
2332 return r;
2333
2334 r = dm_thin_get_highest_mapped_block(tc->td, &highest);
2335 if (r < 0)
2336 return r;
2337
2338 DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
2339 if (r)
2340 DMEMIT("%llu", ((highest + 1) *
2341 tc->pool->sectors_per_block) - 1);
2342 else
2343 DMEMIT("-");
2344 break;
2345
2346 case STATUSTYPE_TABLE:
2347 DMEMIT("%s %lu",
2348 format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
2349 (unsigned long) tc->dev_id);
2350 break;
2351 }
2352 }
2353
2354 return 0;
2355}
2356
2357static int thin_iterate_devices(struct dm_target *ti,
2358 iterate_devices_callout_fn fn, void *data)
2359{
2360 dm_block_t blocks;
2361 struct thin_c *tc = ti->private;
2362
2363 /*
2364 * We can't call dm_pool_get_data_dev_size() since that blocks. So
2365 * we follow a more convoluted path through to the pool's target.
2366 */
2367 if (!tc->pool->ti)
2368 return 0; /* nothing is bound */
2369
2370 blocks = tc->pool->ti->len >> tc->pool->block_shift;
2371 if (blocks)
2372 return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data);
2373
2374 return 0;
2375}
2376
2377static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
2378{
2379 struct thin_c *tc = ti->private;
2380
2381 blk_limits_io_min(limits, 0);
2382 blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT);
2383}
2384
2385static struct target_type thin_target = {
2386 .name = "thin",
2387 .version = {1, 0, 0},
2388 .module = THIS_MODULE,
2389 .ctr = thin_ctr,
2390 .dtr = thin_dtr,
2391 .map = thin_map,
2392 .postsuspend = thin_postsuspend,
2393 .status = thin_status,
2394 .iterate_devices = thin_iterate_devices,
2395 .io_hints = thin_io_hints,
2396};
2397
2398/*----------------------------------------------------------------*/
2399
2400static int __init dm_thin_init(void)
2401{
2402 int r;
2403
2404 pool_table_init();
2405
2406 r = dm_register_target(&thin_target);
2407 if (r)
2408 return r;
2409
2410 r = dm_register_target(&pool_target);
2411 if (r)
2412 dm_unregister_target(&thin_target);
2413
2414 return r;
2415}
2416
2417static void dm_thin_exit(void)
2418{
2419 dm_unregister_target(&thin_target);
2420 dm_unregister_target(&pool_target);
2421}
2422
2423module_init(dm_thin_init);
2424module_exit(dm_thin_exit);
2425
2426MODULE_DESCRIPTION(DM_NAME "device-mapper thin provisioning target");
2427MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2428MODULE_LICENSE("GPL");