diff options
-rw-r--r-- | Documentation/device-mapper/switch.txt | 126 | ||||
-rw-r--r-- | drivers/md/Kconfig | 14 | ||||
-rw-r--r-- | drivers/md/Makefile | 1 | ||||
-rw-r--r-- | drivers/md/dm-switch.c | 538 |
4 files changed, 679 insertions, 0 deletions
diff --git a/Documentation/device-mapper/switch.txt b/Documentation/device-mapper/switch.txt new file mode 100644 index 000000000000..2fa749387be8 --- /dev/null +++ b/Documentation/device-mapper/switch.txt | |||
@@ -0,0 +1,126 @@ | |||
1 | dm-switch | ||
2 | ========= | ||
3 | |||
4 | The device-mapper switch target creates a device that supports an | ||
5 | arbitrary mapping of fixed-size regions of I/O across a fixed set of | ||
6 | paths. The path used for any specific region can be switched | ||
7 | dynamically by sending the target a message. | ||
8 | |||
9 | It maps I/O to underlying block devices efficiently when there is a large | ||
10 | number of fixed-sized address regions but there is no simple pattern | ||
11 | that would allow for a compact representation of the mapping such as | ||
12 | dm-stripe. | ||
13 | |||
14 | Background | ||
15 | ---------- | ||
16 | |||
17 | Dell EqualLogic and some other iSCSI storage arrays use a distributed | ||
18 | frameless architecture. In this architecture, the storage group | ||
19 | consists of a number of distinct storage arrays ("members") each having | ||
20 | independent controllers, disk storage and network adapters. When a LUN | ||
21 | is created it is spread across multiple members. The details of the | ||
22 | spreading are hidden from initiators connected to this storage system. | ||
23 | The storage group exposes a single target discovery portal, no matter | ||
24 | how many members are being used. When iSCSI sessions are created, each | ||
25 | session is connected to an eth port on a single member. Data to a LUN | ||
26 | can be sent on any iSCSI session, and if the blocks being accessed are | ||
27 | stored on another member the I/O will be forwarded as required. This | ||
28 | forwarding is invisible to the initiator. The storage layout is also | ||
29 | dynamic, and the blocks stored on disk may be moved from member to | ||
30 | member as needed to balance the load. | ||
31 | |||
32 | This architecture simplifies the management and configuration of both | ||
33 | the storage group and initiators. In a multipathing configuration, it | ||
34 | is possible to set up multiple iSCSI sessions to use multiple network | ||
35 | interfaces on both the host and target to take advantage of the | ||
36 | increased network bandwidth. An initiator could use a simple round | ||
37 | robin algorithm to send I/O across all paths and let the storage array | ||
38 | members forward it as necessary, but there is a performance advantage to | ||
39 | sending data directly to the correct member. | ||
40 | |||
41 | A device-mapper table already lets you map different regions of a | ||
42 | device onto different targets. However in this architecture the LUN is | ||
43 | spread with an address region size on the order of 10s of MBs, which | ||
44 | means the resulting table could have more than a million entries and | ||
45 | consume far too much memory. | ||
46 | |||
47 | Using this device-mapper switch target we can now build a two-layer | ||
48 | device hierarchy: | ||
49 | |||
50 | Upper Tier - Determine which array member the I/O should be sent to. | ||
51 | Lower Tier - Load balance amongst paths to a particular member. | ||
52 | |||
53 | The lower tier consists of a single dm multipath device for each member. | ||
54 | Each of these multipath devices contains the set of paths directly to | ||
55 | the array member in one priority group, and leverages existing path | ||
56 | selectors to load balance amongst these paths. We also build a | ||
57 | non-preferred priority group containing paths to other array members for | ||
58 | failover reasons. | ||
59 | |||
60 | The upper tier consists of a single dm-switch device. This device uses | ||
61 | a bitmap to look up the location of the I/O and choose the appropriate | ||
62 | lower tier device to route the I/O. By using a bitmap we are able to | ||
63 | use 4 bits for each address range in a 16 member group (which is very | ||
64 | large for us). This is a much denser representation than the dm table | ||
65 | b-tree can achieve. | ||
66 | |||
67 | Construction Parameters | ||
68 | ======================= | ||
69 | |||
70 | <num_paths> <region_size> <num_optional_args> [<optional_args>...] | ||
71 | [<dev_path> <offset>]+ | ||
72 | |||
73 | <num_paths> | ||
74 | The number of paths across which to distribute the I/O. | ||
75 | |||
76 | <region_size> | ||
77 | The number of 512-byte sectors in a region. Each region can be redirected | ||
78 | to any of the available paths. | ||
79 | |||
80 | <num_optional_args> | ||
81 | The number of optional arguments. Currently, no optional arguments | ||
82 | are supported and so this must be zero. | ||
83 | |||
84 | <dev_path> | ||
85 | The block device that represents a specific path to the device. | ||
86 | |||
87 | <offset> | ||
88 | The offset of the start of data on the specific <dev_path> (in units | ||
89 | of 512-byte sectors). This number is added to the sector number when | ||
90 | forwarding the request to the specific path. Typically it is zero. | ||
91 | |||
92 | Messages | ||
93 | ======== | ||
94 | |||
95 | set_region_mappings <index>:<path_nr> [<index>]:<path_nr> [<index>]:<path_nr>... | ||
96 | |||
97 | Modify the region table by specifying which regions are redirected to | ||
98 | which paths. | ||
99 | |||
100 | <index> | ||
101 | The region number (region size was specified in constructor parameters). | ||
102 | If index is omitted, the next region (previous index + 1) is used. | ||
103 | Expressed in hexadecimal (WITHOUT any prefix like 0x). | ||
104 | |||
105 | <path_nr> | ||
106 | The path number in the range 0 ... (<num_paths> - 1). | ||
107 | Expressed in hexadecimal (WITHOUT any prefix like 0x). | ||
108 | |||
109 | Status | ||
110 | ====== | ||
111 | |||
112 | No status line is reported. | ||
113 | |||
114 | Example | ||
115 | ======= | ||
116 | |||
117 | Assume that you have volumes vg1/switch0 vg1/switch1 vg1/switch2 with | ||
118 | the same size. | ||
119 | |||
120 | Create a switch device with 64kB region size: | ||
121 | dmsetup create switch --table "0 `blockdev --getsize /dev/vg1/switch0` | ||
122 | switch 3 128 0 /dev/vg1/switch0 0 /dev/vg1/switch1 0 /dev/vg1/switch2 0" | ||
123 | |||
124 | Set mappings for the first 7 entries to point to devices switch0, switch1, | ||
125 | switch2, switch0, switch1, switch2, switch1: | ||
126 | dmsetup message switch 0 set_region_mappings 0:0 :1 :2 :0 :1 :2 :1 | ||
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 3bfc8f1da9fe..30b426ed744b 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig | |||
@@ -412,4 +412,18 @@ config DM_VERITY | |||
412 | 412 | ||
413 | If unsure, say N. | 413 | If unsure, say N. |
414 | 414 | ||
415 | config DM_SWITCH | ||
416 | tristate "Switch target support (EXPERIMENTAL)" | ||
417 | depends on BLK_DEV_DM | ||
418 | ---help--- | ||
419 | This device-mapper target creates a device that supports an arbitrary | ||
420 | mapping of fixed-size regions of I/O across a fixed set of paths. | ||
421 | The path used for any specific region can be switched dynamically | ||
422 | by sending the target a message. | ||
423 | |||
424 | To compile this code as a module, choose M here: the module will | ||
425 | be called dm-switch. | ||
426 | |||
427 | If unsure, say N. | ||
428 | |||
415 | endif # MD | 429 | endif # MD |
diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 1439fd4ad9b1..5ef78efc27f2 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile | |||
@@ -40,6 +40,7 @@ obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o | |||
40 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o | 40 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o |
41 | obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o | 41 | obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o |
42 | obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o | 42 | obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o |
43 | obj-$(CONFIG_DM_SWITCH) += dm-switch.o | ||
43 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o | 44 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o |
44 | obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/ | 45 | obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/ |
45 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o | 46 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o |
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c new file mode 100644 index 000000000000..ff9ac4be4721 --- /dev/null +++ b/drivers/md/dm-switch.c | |||
@@ -0,0 +1,538 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2010-2012 by Dell Inc. All rights reserved. | ||
3 | * Copyright (C) 2011-2013 Red Hat, Inc. | ||
4 | * | ||
5 | * This file is released under the GPL. | ||
6 | * | ||
7 | * dm-switch is a device-mapper target that maps IO to underlying block | ||
8 | * devices efficiently when there are a large number of fixed-sized | ||
9 | * address regions but there is no simple pattern to allow for a compact | ||
10 | * mapping representation such as dm-stripe. | ||
11 | */ | ||
12 | |||
13 | #include <linux/device-mapper.h> | ||
14 | |||
15 | #include <linux/module.h> | ||
16 | #include <linux/init.h> | ||
17 | #include <linux/vmalloc.h> | ||
18 | |||
19 | #define DM_MSG_PREFIX "switch" | ||
20 | |||
21 | /* | ||
22 | * One region_table_slot_t holds <region_entries_per_slot> region table | ||
23 | * entries each of which is <region_table_entry_bits> in size. | ||
24 | */ | ||
25 | typedef unsigned long region_table_slot_t; | ||
26 | |||
27 | /* | ||
28 | * A device with the offset to its start sector. | ||
29 | */ | ||
30 | struct switch_path { | ||
31 | struct dm_dev *dmdev; | ||
32 | sector_t start; | ||
33 | }; | ||
34 | |||
35 | /* | ||
36 | * Context block for a dm switch device. | ||
37 | */ | ||
38 | struct switch_ctx { | ||
39 | struct dm_target *ti; | ||
40 | |||
41 | unsigned nr_paths; /* Number of paths in path_list. */ | ||
42 | |||
43 | unsigned region_size; /* Region size in 512-byte sectors */ | ||
44 | unsigned long nr_regions; /* Number of regions making up the device */ | ||
45 | signed char region_size_bits; /* log2 of region_size or -1 */ | ||
46 | |||
47 | unsigned char region_table_entry_bits; /* Number of bits in one region table entry */ | ||
48 | unsigned char region_entries_per_slot; /* Number of entries in one region table slot */ | ||
49 | signed char region_entries_per_slot_bits; /* log2 of region_entries_per_slot or -1 */ | ||
50 | |||
51 | region_table_slot_t *region_table; /* Region table */ | ||
52 | |||
53 | /* | ||
54 | * Array of dm devices to switch between. | ||
55 | */ | ||
56 | struct switch_path path_list[0]; | ||
57 | }; | ||
58 | |||
59 | static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths, | ||
60 | unsigned region_size) | ||
61 | { | ||
62 | struct switch_ctx *sctx; | ||
63 | |||
64 | sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path), | ||
65 | GFP_KERNEL); | ||
66 | if (!sctx) | ||
67 | return NULL; | ||
68 | |||
69 | sctx->ti = ti; | ||
70 | sctx->region_size = region_size; | ||
71 | |||
72 | ti->private = sctx; | ||
73 | |||
74 | return sctx; | ||
75 | } | ||
76 | |||
77 | static int alloc_region_table(struct dm_target *ti, unsigned nr_paths) | ||
78 | { | ||
79 | struct switch_ctx *sctx = ti->private; | ||
80 | sector_t nr_regions = ti->len; | ||
81 | sector_t nr_slots; | ||
82 | |||
83 | if (!(sctx->region_size & (sctx->region_size - 1))) | ||
84 | sctx->region_size_bits = __ffs(sctx->region_size); | ||
85 | else | ||
86 | sctx->region_size_bits = -1; | ||
87 | |||
88 | sctx->region_table_entry_bits = 1; | ||
89 | while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 && | ||
90 | (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths) | ||
91 | sctx->region_table_entry_bits++; | ||
92 | |||
93 | sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits; | ||
94 | if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1))) | ||
95 | sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot); | ||
96 | else | ||
97 | sctx->region_entries_per_slot_bits = -1; | ||
98 | |||
99 | if (sector_div(nr_regions, sctx->region_size)) | ||
100 | nr_regions++; | ||
101 | |||
102 | sctx->nr_regions = nr_regions; | ||
103 | if (sctx->nr_regions != nr_regions || sctx->nr_regions >= ULONG_MAX) { | ||
104 | ti->error = "Region table too large"; | ||
105 | return -EINVAL; | ||
106 | } | ||
107 | |||
108 | nr_slots = nr_regions; | ||
109 | if (sector_div(nr_slots, sctx->region_entries_per_slot)) | ||
110 | nr_slots++; | ||
111 | |||
112 | if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) { | ||
113 | ti->error = "Region table too large"; | ||
114 | return -EINVAL; | ||
115 | } | ||
116 | |||
117 | sctx->region_table = vmalloc(nr_slots * sizeof(region_table_slot_t)); | ||
118 | if (!sctx->region_table) { | ||
119 | ti->error = "Cannot allocate region table"; | ||
120 | return -ENOMEM; | ||
121 | } | ||
122 | |||
123 | return 0; | ||
124 | } | ||
125 | |||
126 | static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr, | ||
127 | unsigned long *region_index, unsigned *bit) | ||
128 | { | ||
129 | if (sctx->region_entries_per_slot_bits >= 0) { | ||
130 | *region_index = region_nr >> sctx->region_entries_per_slot_bits; | ||
131 | *bit = region_nr & (sctx->region_entries_per_slot - 1); | ||
132 | } else { | ||
133 | *region_index = region_nr / sctx->region_entries_per_slot; | ||
134 | *bit = region_nr % sctx->region_entries_per_slot; | ||
135 | } | ||
136 | |||
137 | *bit *= sctx->region_table_entry_bits; | ||
138 | } | ||
139 | |||
140 | /* | ||
141 | * Find which path to use at given offset. | ||
142 | */ | ||
143 | static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset) | ||
144 | { | ||
145 | unsigned long region_index; | ||
146 | unsigned bit, path_nr; | ||
147 | sector_t p; | ||
148 | |||
149 | p = offset; | ||
150 | if (sctx->region_size_bits >= 0) | ||
151 | p >>= sctx->region_size_bits; | ||
152 | else | ||
153 | sector_div(p, sctx->region_size); | ||
154 | |||
155 | switch_get_position(sctx, p, ®ion_index, &bit); | ||
156 | path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) & | ||
157 | ((1 << sctx->region_table_entry_bits) - 1); | ||
158 | |||
159 | /* This can only happen if the processor uses non-atomic stores. */ | ||
160 | if (unlikely(path_nr >= sctx->nr_paths)) | ||
161 | path_nr = 0; | ||
162 | |||
163 | return path_nr; | ||
164 | } | ||
165 | |||
166 | static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr, | ||
167 | unsigned value) | ||
168 | { | ||
169 | unsigned long region_index; | ||
170 | unsigned bit; | ||
171 | region_table_slot_t pte; | ||
172 | |||
173 | switch_get_position(sctx, region_nr, ®ion_index, &bit); | ||
174 | |||
175 | pte = sctx->region_table[region_index]; | ||
176 | pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit); | ||
177 | pte |= (region_table_slot_t)value << bit; | ||
178 | sctx->region_table[region_index] = pte; | ||
179 | } | ||
180 | |||
181 | /* | ||
182 | * Fill the region table with an initial round robin pattern. | ||
183 | */ | ||
184 | static void initialise_region_table(struct switch_ctx *sctx) | ||
185 | { | ||
186 | unsigned path_nr = 0; | ||
187 | unsigned long region_nr; | ||
188 | |||
189 | for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) { | ||
190 | switch_region_table_write(sctx, region_nr, path_nr); | ||
191 | if (++path_nr >= sctx->nr_paths) | ||
192 | path_nr = 0; | ||
193 | } | ||
194 | } | ||
195 | |||
196 | static int parse_path(struct dm_arg_set *as, struct dm_target *ti) | ||
197 | { | ||
198 | struct switch_ctx *sctx = ti->private; | ||
199 | unsigned long long start; | ||
200 | int r; | ||
201 | |||
202 | r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), | ||
203 | &sctx->path_list[sctx->nr_paths].dmdev); | ||
204 | if (r) { | ||
205 | ti->error = "Device lookup failed"; | ||
206 | return r; | ||
207 | } | ||
208 | |||
209 | if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) { | ||
210 | ti->error = "Invalid device starting offset"; | ||
211 | dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev); | ||
212 | return -EINVAL; | ||
213 | } | ||
214 | |||
215 | sctx->path_list[sctx->nr_paths].start = start; | ||
216 | |||
217 | sctx->nr_paths++; | ||
218 | |||
219 | return 0; | ||
220 | } | ||
221 | |||
222 | /* | ||
223 | * Destructor: Don't free the dm_target, just the ti->private data (if any). | ||
224 | */ | ||
225 | static void switch_dtr(struct dm_target *ti) | ||
226 | { | ||
227 | struct switch_ctx *sctx = ti->private; | ||
228 | |||
229 | while (sctx->nr_paths--) | ||
230 | dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev); | ||
231 | |||
232 | vfree(sctx->region_table); | ||
233 | kfree(sctx); | ||
234 | } | ||
235 | |||
236 | /* | ||
237 | * Constructor arguments: | ||
238 | * <num_paths> <region_size> <num_optional_args> [<optional_args>...] | ||
239 | * [<dev_path> <offset>]+ | ||
240 | * | ||
241 | * Optional args are to allow for future extension: currently this | ||
242 | * parameter must be 0. | ||
243 | */ | ||
244 | static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv) | ||
245 | { | ||
246 | static struct dm_arg _args[] = { | ||
247 | {1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"}, | ||
248 | {1, UINT_MAX, "Invalid region size"}, | ||
249 | {0, 0, "Invalid number of optional args"}, | ||
250 | }; | ||
251 | |||
252 | struct switch_ctx *sctx; | ||
253 | struct dm_arg_set as; | ||
254 | unsigned nr_paths, region_size, nr_optional_args; | ||
255 | int r; | ||
256 | |||
257 | as.argc = argc; | ||
258 | as.argv = argv; | ||
259 | |||
260 | r = dm_read_arg(_args, &as, &nr_paths, &ti->error); | ||
261 | if (r) | ||
262 | return -EINVAL; | ||
263 | |||
264 | r = dm_read_arg(_args + 1, &as, ®ion_size, &ti->error); | ||
265 | if (r) | ||
266 | return r; | ||
267 | |||
268 | r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error); | ||
269 | if (r) | ||
270 | return r; | ||
271 | /* parse optional arguments here, if we add any */ | ||
272 | |||
273 | if (as.argc != nr_paths * 2) { | ||
274 | ti->error = "Incorrect number of path arguments"; | ||
275 | return -EINVAL; | ||
276 | } | ||
277 | |||
278 | sctx = alloc_switch_ctx(ti, nr_paths, region_size); | ||
279 | if (!sctx) { | ||
280 | ti->error = "Cannot allocate redirection context"; | ||
281 | return -ENOMEM; | ||
282 | } | ||
283 | |||
284 | r = dm_set_target_max_io_len(ti, region_size); | ||
285 | if (r) | ||
286 | goto error; | ||
287 | |||
288 | while (as.argc) { | ||
289 | r = parse_path(&as, ti); | ||
290 | if (r) | ||
291 | goto error; | ||
292 | } | ||
293 | |||
294 | r = alloc_region_table(ti, nr_paths); | ||
295 | if (r) | ||
296 | goto error; | ||
297 | |||
298 | initialise_region_table(sctx); | ||
299 | |||
300 | /* For UNMAP, sending the request down any path is sufficient */ | ||
301 | ti->num_discard_bios = 1; | ||
302 | |||
303 | return 0; | ||
304 | |||
305 | error: | ||
306 | switch_dtr(ti); | ||
307 | |||
308 | return r; | ||
309 | } | ||
310 | |||
311 | static int switch_map(struct dm_target *ti, struct bio *bio) | ||
312 | { | ||
313 | struct switch_ctx *sctx = ti->private; | ||
314 | sector_t offset = dm_target_offset(ti, bio->bi_sector); | ||
315 | unsigned path_nr = switch_get_path_nr(sctx, offset); | ||
316 | |||
317 | bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev; | ||
318 | bio->bi_sector = sctx->path_list[path_nr].start + offset; | ||
319 | |||
320 | return DM_MAPIO_REMAPPED; | ||
321 | } | ||
322 | |||
323 | /* | ||
324 | * We need to parse hex numbers in the message as quickly as possible. | ||
325 | * | ||
326 | * This table-based hex parser improves performance. | ||
327 | * It improves a time to load 1000000 entries compared to the condition-based | ||
328 | * parser. | ||
329 | * table-based parser condition-based parser | ||
330 | * PA-RISC 0.29s 0.31s | ||
331 | * Opteron 0.0495s 0.0498s | ||
332 | */ | ||
333 | static const unsigned char hex_table[256] = { | ||
334 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
335 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
336 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
337 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, | ||
338 | 255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
339 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
340 | 255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
341 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
342 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
343 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
344 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
345 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
346 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
347 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
348 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, | ||
349 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 | ||
350 | }; | ||
351 | |||
352 | static __always_inline unsigned long parse_hex(const char **string) | ||
353 | { | ||
354 | unsigned char d; | ||
355 | unsigned long r = 0; | ||
356 | |||
357 | while ((d = hex_table[(unsigned char)**string]) < 16) { | ||
358 | r = (r << 4) | d; | ||
359 | (*string)++; | ||
360 | } | ||
361 | |||
362 | return r; | ||
363 | } | ||
364 | |||
365 | static int process_set_region_mappings(struct switch_ctx *sctx, | ||
366 | unsigned argc, char **argv) | ||
367 | { | ||
368 | unsigned i; | ||
369 | unsigned long region_index = 0; | ||
370 | |||
371 | for (i = 1; i < argc; i++) { | ||
372 | unsigned long path_nr; | ||
373 | const char *string = argv[i]; | ||
374 | |||
375 | if (*string == ':') | ||
376 | region_index++; | ||
377 | else { | ||
378 | region_index = parse_hex(&string); | ||
379 | if (unlikely(*string != ':')) { | ||
380 | DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); | ||
381 | return -EINVAL; | ||
382 | } | ||
383 | } | ||
384 | |||
385 | string++; | ||
386 | if (unlikely(!*string)) { | ||
387 | DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); | ||
388 | return -EINVAL; | ||
389 | } | ||
390 | |||
391 | path_nr = parse_hex(&string); | ||
392 | if (unlikely(*string)) { | ||
393 | DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); | ||
394 | return -EINVAL; | ||
395 | } | ||
396 | if (unlikely(region_index >= sctx->nr_regions)) { | ||
397 | DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions); | ||
398 | return -EINVAL; | ||
399 | } | ||
400 | if (unlikely(path_nr >= sctx->nr_paths)) { | ||
401 | DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths); | ||
402 | return -EINVAL; | ||
403 | } | ||
404 | |||
405 | switch_region_table_write(sctx, region_index, path_nr); | ||
406 | } | ||
407 | |||
408 | return 0; | ||
409 | } | ||
410 | |||
411 | /* | ||
412 | * Messages are processed one-at-a-time. | ||
413 | * | ||
414 | * Only set_region_mappings is supported. | ||
415 | */ | ||
416 | static int switch_message(struct dm_target *ti, unsigned argc, char **argv) | ||
417 | { | ||
418 | static DEFINE_MUTEX(message_mutex); | ||
419 | |||
420 | struct switch_ctx *sctx = ti->private; | ||
421 | int r = -EINVAL; | ||
422 | |||
423 | mutex_lock(&message_mutex); | ||
424 | |||
425 | if (!strcasecmp(argv[0], "set_region_mappings")) | ||
426 | r = process_set_region_mappings(sctx, argc, argv); | ||
427 | else | ||
428 | DMWARN("Unrecognised message received."); | ||
429 | |||
430 | mutex_unlock(&message_mutex); | ||
431 | |||
432 | return r; | ||
433 | } | ||
434 | |||
435 | static void switch_status(struct dm_target *ti, status_type_t type, | ||
436 | unsigned status_flags, char *result, unsigned maxlen) | ||
437 | { | ||
438 | struct switch_ctx *sctx = ti->private; | ||
439 | unsigned sz = 0; | ||
440 | int path_nr; | ||
441 | |||
442 | switch (type) { | ||
443 | case STATUSTYPE_INFO: | ||
444 | result[0] = '\0'; | ||
445 | break; | ||
446 | |||
447 | case STATUSTYPE_TABLE: | ||
448 | DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size); | ||
449 | for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) | ||
450 | DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name, | ||
451 | (unsigned long long)sctx->path_list[path_nr].start); | ||
452 | break; | ||
453 | } | ||
454 | } | ||
455 | |||
456 | /* | ||
457 | * Switch ioctl: | ||
458 | * | ||
459 | * Passthrough all ioctls to the path for sector 0 | ||
460 | */ | ||
461 | static int switch_ioctl(struct dm_target *ti, unsigned cmd, | ||
462 | unsigned long arg) | ||
463 | { | ||
464 | struct switch_ctx *sctx = ti->private; | ||
465 | struct block_device *bdev; | ||
466 | fmode_t mode; | ||
467 | unsigned path_nr; | ||
468 | int r = 0; | ||
469 | |||
470 | path_nr = switch_get_path_nr(sctx, 0); | ||
471 | |||
472 | bdev = sctx->path_list[path_nr].dmdev->bdev; | ||
473 | mode = sctx->path_list[path_nr].dmdev->mode; | ||
474 | |||
475 | /* | ||
476 | * Only pass ioctls through if the device sizes match exactly. | ||
477 | */ | ||
478 | if (ti->len + sctx->path_list[path_nr].start != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) | ||
479 | r = scsi_verify_blk_ioctl(NULL, cmd); | ||
480 | |||
481 | return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); | ||
482 | } | ||
483 | |||
484 | static int switch_iterate_devices(struct dm_target *ti, | ||
485 | iterate_devices_callout_fn fn, void *data) | ||
486 | { | ||
487 | struct switch_ctx *sctx = ti->private; | ||
488 | int path_nr; | ||
489 | int r; | ||
490 | |||
491 | for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) { | ||
492 | r = fn(ti, sctx->path_list[path_nr].dmdev, | ||
493 | sctx->path_list[path_nr].start, ti->len, data); | ||
494 | if (r) | ||
495 | return r; | ||
496 | } | ||
497 | |||
498 | return 0; | ||
499 | } | ||
500 | |||
501 | static struct target_type switch_target = { | ||
502 | .name = "switch", | ||
503 | .version = {1, 0, 0}, | ||
504 | .module = THIS_MODULE, | ||
505 | .ctr = switch_ctr, | ||
506 | .dtr = switch_dtr, | ||
507 | .map = switch_map, | ||
508 | .message = switch_message, | ||
509 | .status = switch_status, | ||
510 | .ioctl = switch_ioctl, | ||
511 | .iterate_devices = switch_iterate_devices, | ||
512 | }; | ||
513 | |||
514 | static int __init dm_switch_init(void) | ||
515 | { | ||
516 | int r; | ||
517 | |||
518 | r = dm_register_target(&switch_target); | ||
519 | if (r < 0) | ||
520 | DMERR("dm_register_target() failed %d", r); | ||
521 | |||
522 | return r; | ||
523 | } | ||
524 | |||
525 | static void __exit dm_switch_exit(void) | ||
526 | { | ||
527 | dm_unregister_target(&switch_target); | ||
528 | } | ||
529 | |||
530 | module_init(dm_switch_init); | ||
531 | module_exit(dm_switch_exit); | ||
532 | |||
533 | MODULE_DESCRIPTION(DM_NAME " dynamic path switching target"); | ||
534 | MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>"); | ||
535 | MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>"); | ||
536 | MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>"); | ||
537 | MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>"); | ||
538 | MODULE_LICENSE("GPL"); | ||