diff options
author | Kiyoshi Ueda <k-ueda@ct.jp.nec.com> | 2009-06-22 05:12:28 -0400 |
---|---|---|
committer | Alasdair G Kergon <agk@redhat.com> | 2009-06-22 05:12:28 -0400 |
commit | f392ba889b019602976082bfe7bf486c2594f85c (patch) | |
tree | 962e8f354dfe3df2021476412be8d1bcec8a03d0 | |
parent | fd5e033908b7b743b5650790f196761dd930f988 (diff) |
dm mpath: add service time load balancer
This patch adds a service time oriented dynamic load balancer,
dm-service-time, which selects the path with the shortest estimated
service time for the incoming I/O.
The service time is estimated by dividing the in-flight I/O size
by a performance value of each path.
The performance value can be given as a table argument at the table
loading time. If no performance value is given, all paths are
considered equal.
Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
-rw-r--r-- | Documentation/device-mapper/dm-service-time.txt | 91 | ||||
-rw-r--r-- | drivers/md/Kconfig | 10 | ||||
-rw-r--r-- | drivers/md/Makefile | 1 | ||||
-rw-r--r-- | drivers/md/dm-service-time.c | 339 |
4 files changed, 441 insertions, 0 deletions
diff --git a/Documentation/device-mapper/dm-service-time.txt b/Documentation/device-mapper/dm-service-time.txt new file mode 100644 index 000000000000..7d00668e97bb --- /dev/null +++ b/Documentation/device-mapper/dm-service-time.txt | |||
@@ -0,0 +1,91 @@ | |||
1 | dm-service-time | ||
2 | =============== | ||
3 | |||
4 | dm-service-time is a path selector module for device-mapper targets, | ||
5 | which selects a path with the shortest estimated service time for | ||
6 | the incoming I/O. | ||
7 | |||
8 | The service time for each path is estimated by dividing the total size | ||
9 | of in-flight I/Os on a path with the performance value of the path. | ||
10 | The performance value is a relative throughput value among all paths | ||
11 | in a path-group, and it can be specified as a table argument. | ||
12 | |||
13 | The path selector name is 'service-time'. | ||
14 | |||
15 | Table parameters for each path: [<repeat_count> [<relative_throughput>]] | ||
16 | <repeat_count>: The number of I/Os to dispatch using the selected | ||
17 | path before switching to the next path. | ||
18 | If not given, internal default is used. To check | ||
19 | the default value, see the activated table. | ||
20 | <relative_throughput>: The relative throughput value of the path | ||
21 | among all paths in the path-group. | ||
22 | The valid range is 0-100. | ||
23 | If not given, minimum value '1' is used. | ||
24 | If '0' is given, the path isn't selected while | ||
25 | other paths having a positive value are available. | ||
26 | |||
27 | Status for each path: <status> <fail-count> <in-flight-size> \ | ||
28 | <relative_throughput> | ||
29 | <status>: 'A' if the path is active, 'F' if the path is failed. | ||
30 | <fail-count>: The number of path failures. | ||
31 | <in-flight-size>: The size of in-flight I/Os on the path. | ||
32 | <relative_throughput>: The relative throughput value of the path | ||
33 | among all paths in the path-group. | ||
34 | |||
35 | |||
36 | Algorithm | ||
37 | ========= | ||
38 | |||
39 | dm-service-time adds the I/O size to 'in-flight-size' when the I/O is | ||
40 | dispatched and substracts when completed. | ||
41 | Basically, dm-service-time selects a path having minimum service time | ||
42 | which is calculated by: | ||
43 | |||
44 | ('in-flight-size' + 'size-of-incoming-io') / 'relative_throughput' | ||
45 | |||
46 | However, some optimizations below are used to reduce the calculation | ||
47 | as much as possible. | ||
48 | |||
49 | 1. If the paths have the same 'relative_throughput', skip | ||
50 | the division and just compare the 'in-flight-size'. | ||
51 | |||
52 | 2. If the paths have the same 'in-flight-size', skip the division | ||
53 | and just compare the 'relative_throughput'. | ||
54 | |||
55 | 3. If some paths have non-zero 'relative_throughput' and others | ||
56 | have zero 'relative_throughput', ignore those paths with zero | ||
57 | 'relative_throughput'. | ||
58 | |||
59 | If such optimizations can't be applied, calculate service time, and | ||
60 | compare service time. | ||
61 | If calculated service time is equal, the path having maximum | ||
62 | 'relative_throughput' may be better. So compare 'relative_throughput' | ||
63 | then. | ||
64 | |||
65 | |||
66 | Examples | ||
67 | ======== | ||
68 | In case that 2 paths (sda and sdb) are used with repeat_count == 128 | ||
69 | and sda has an average throughput 1GB/s and sdb has 4GB/s, | ||
70 | 'relative_throughput' value may be '1' for sda and '4' for sdb. | ||
71 | |||
72 | # echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \ | ||
73 | dmsetup create test | ||
74 | # | ||
75 | # dmsetup table | ||
76 | test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4 | ||
77 | # | ||
78 | # dmsetup status | ||
79 | test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 1 8:16 A 0 0 4 | ||
80 | |||
81 | |||
82 | Or '2' for sda and '8' for sdb would be also true. | ||
83 | |||
84 | # echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8" \ | ||
85 | dmsetup create test | ||
86 | # | ||
87 | # dmsetup table | ||
88 | test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8 | ||
89 | # | ||
90 | # dmsetup status | ||
91 | test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 2 8:16 A 0 0 8 | ||
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 3b311d273346..09f93fa68912 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig | |||
@@ -258,6 +258,16 @@ config DM_MULTIPATH_QL | |||
258 | 258 | ||
259 | If unsure, say N. | 259 | If unsure, say N. |
260 | 260 | ||
261 | config DM_MULTIPATH_ST | ||
262 | tristate "I/O Path Selector based on the service time" | ||
263 | depends on DM_MULTIPATH | ||
264 | ---help--- | ||
265 | This path selector is a dynamic load balancer which selects | ||
266 | the path expected to complete the incoming I/O in the shortest | ||
267 | time. | ||
268 | |||
269 | If unsure, say N. | ||
270 | |||
261 | config DM_DELAY | 271 | config DM_DELAY |
262 | tristate "I/O delaying target (EXPERIMENTAL)" | 272 | tristate "I/O delaying target (EXPERIMENTAL)" |
263 | depends on BLK_DEV_DM && EXPERIMENTAL | 273 | depends on BLK_DEV_DM && EXPERIMENTAL |
diff --git a/drivers/md/Makefile b/drivers/md/Makefile index ff9f545dd516..dade52f60733 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile | |||
@@ -37,6 +37,7 @@ obj-$(CONFIG_DM_CRYPT) += dm-crypt.o | |||
37 | obj-$(CONFIG_DM_DELAY) += dm-delay.o | 37 | obj-$(CONFIG_DM_DELAY) += dm-delay.o |
38 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o | 38 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o |
39 | obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o | 39 | obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o |
40 | obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o | ||
40 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o | 41 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o |
41 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o | 42 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o |
42 | obj-$(CONFIG_DM_ZERO) += dm-zero.o | 43 | obj-$(CONFIG_DM_ZERO) += dm-zero.o |
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c new file mode 100644 index 000000000000..cfa668f46c40 --- /dev/null +++ b/drivers/md/dm-service-time.c | |||
@@ -0,0 +1,339 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved. | ||
3 | * | ||
4 | * Module Author: Kiyoshi Ueda | ||
5 | * | ||
6 | * This file is released under the GPL. | ||
7 | * | ||
8 | * Throughput oriented path selector. | ||
9 | */ | ||
10 | |||
11 | #include "dm.h" | ||
12 | #include "dm-path-selector.h" | ||
13 | |||
14 | #define DM_MSG_PREFIX "multipath service-time" | ||
15 | #define ST_MIN_IO 1 | ||
16 | #define ST_MAX_RELATIVE_THROUGHPUT 100 | ||
17 | #define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7 | ||
18 | #define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT) | ||
19 | #define ST_VERSION "0.2.0" | ||
20 | |||
21 | struct selector { | ||
22 | struct list_head valid_paths; | ||
23 | struct list_head failed_paths; | ||
24 | }; | ||
25 | |||
26 | struct path_info { | ||
27 | struct list_head list; | ||
28 | struct dm_path *path; | ||
29 | unsigned repeat_count; | ||
30 | unsigned relative_throughput; | ||
31 | atomic_t in_flight_size; /* Total size of in-flight I/Os */ | ||
32 | }; | ||
33 | |||
34 | static struct selector *alloc_selector(void) | ||
35 | { | ||
36 | struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
37 | |||
38 | if (s) { | ||
39 | INIT_LIST_HEAD(&s->valid_paths); | ||
40 | INIT_LIST_HEAD(&s->failed_paths); | ||
41 | } | ||
42 | |||
43 | return s; | ||
44 | } | ||
45 | |||
46 | static int st_create(struct path_selector *ps, unsigned argc, char **argv) | ||
47 | { | ||
48 | struct selector *s = alloc_selector(); | ||
49 | |||
50 | if (!s) | ||
51 | return -ENOMEM; | ||
52 | |||
53 | ps->context = s; | ||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | static void free_paths(struct list_head *paths) | ||
58 | { | ||
59 | struct path_info *pi, *next; | ||
60 | |||
61 | list_for_each_entry_safe(pi, next, paths, list) { | ||
62 | list_del(&pi->list); | ||
63 | kfree(pi); | ||
64 | } | ||
65 | } | ||
66 | |||
67 | static void st_destroy(struct path_selector *ps) | ||
68 | { | ||
69 | struct selector *s = ps->context; | ||
70 | |||
71 | free_paths(&s->valid_paths); | ||
72 | free_paths(&s->failed_paths); | ||
73 | kfree(s); | ||
74 | ps->context = NULL; | ||
75 | } | ||
76 | |||
77 | static int st_status(struct path_selector *ps, struct dm_path *path, | ||
78 | status_type_t type, char *result, unsigned maxlen) | ||
79 | { | ||
80 | unsigned sz = 0; | ||
81 | struct path_info *pi; | ||
82 | |||
83 | if (!path) | ||
84 | DMEMIT("0 "); | ||
85 | else { | ||
86 | pi = path->pscontext; | ||
87 | |||
88 | switch (type) { | ||
89 | case STATUSTYPE_INFO: | ||
90 | DMEMIT("%d %u ", atomic_read(&pi->in_flight_size), | ||
91 | pi->relative_throughput); | ||
92 | break; | ||
93 | case STATUSTYPE_TABLE: | ||
94 | DMEMIT("%u %u ", pi->repeat_count, | ||
95 | pi->relative_throughput); | ||
96 | break; | ||
97 | } | ||
98 | } | ||
99 | |||
100 | return sz; | ||
101 | } | ||
102 | |||
103 | static int st_add_path(struct path_selector *ps, struct dm_path *path, | ||
104 | int argc, char **argv, char **error) | ||
105 | { | ||
106 | struct selector *s = ps->context; | ||
107 | struct path_info *pi; | ||
108 | unsigned repeat_count = ST_MIN_IO; | ||
109 | unsigned relative_throughput = 1; | ||
110 | |||
111 | /* | ||
112 | * Arguments: [<repeat_count> [<relative_throughput>]] | ||
113 | * <repeat_count>: The number of I/Os before switching path. | ||
114 | * If not given, default (ST_MIN_IO) is used. | ||
115 | * <relative_throughput>: The relative throughput value of | ||
116 | * the path among all paths in the path-group. | ||
117 | * The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT> | ||
118 | * If not given, minimum value '1' is used. | ||
119 | * If '0' is given, the path isn't selected while | ||
120 | * other paths having a positive value are | ||
121 | * available. | ||
122 | */ | ||
123 | if (argc > 2) { | ||
124 | *error = "service-time ps: incorrect number of arguments"; | ||
125 | return -EINVAL; | ||
126 | } | ||
127 | |||
128 | if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | ||
129 | *error = "service-time ps: invalid repeat count"; | ||
130 | return -EINVAL; | ||
131 | } | ||
132 | |||
133 | if ((argc == 2) && | ||
134 | (sscanf(argv[1], "%u", &relative_throughput) != 1 || | ||
135 | relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { | ||
136 | *error = "service-time ps: invalid relative_throughput value"; | ||
137 | return -EINVAL; | ||
138 | } | ||
139 | |||
140 | /* allocate the path */ | ||
141 | pi = kmalloc(sizeof(*pi), GFP_KERNEL); | ||
142 | if (!pi) { | ||
143 | *error = "service-time ps: Error allocating path context"; | ||
144 | return -ENOMEM; | ||
145 | } | ||
146 | |||
147 | pi->path = path; | ||
148 | pi->repeat_count = repeat_count; | ||
149 | pi->relative_throughput = relative_throughput; | ||
150 | atomic_set(&pi->in_flight_size, 0); | ||
151 | |||
152 | path->pscontext = pi; | ||
153 | |||
154 | list_add_tail(&pi->list, &s->valid_paths); | ||
155 | |||
156 | return 0; | ||
157 | } | ||
158 | |||
159 | static void st_fail_path(struct path_selector *ps, struct dm_path *path) | ||
160 | { | ||
161 | struct selector *s = ps->context; | ||
162 | struct path_info *pi = path->pscontext; | ||
163 | |||
164 | list_move(&pi->list, &s->failed_paths); | ||
165 | } | ||
166 | |||
167 | static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) | ||
168 | { | ||
169 | struct selector *s = ps->context; | ||
170 | struct path_info *pi = path->pscontext; | ||
171 | |||
172 | list_move_tail(&pi->list, &s->valid_paths); | ||
173 | |||
174 | return 0; | ||
175 | } | ||
176 | |||
177 | /* | ||
178 | * Compare the estimated service time of 2 paths, pi1 and pi2, | ||
179 | * for the incoming I/O. | ||
180 | * | ||
181 | * Returns: | ||
182 | * < 0 : pi1 is better | ||
183 | * 0 : no difference between pi1 and pi2 | ||
184 | * > 0 : pi2 is better | ||
185 | * | ||
186 | * Description: | ||
187 | * Basically, the service time is estimated by: | ||
188 | * ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput' | ||
189 | * To reduce the calculation, some optimizations are made. | ||
190 | * (See comments inline) | ||
191 | */ | ||
192 | static int st_compare_load(struct path_info *pi1, struct path_info *pi2, | ||
193 | size_t incoming) | ||
194 | { | ||
195 | size_t sz1, sz2, st1, st2; | ||
196 | |||
197 | sz1 = atomic_read(&pi1->in_flight_size); | ||
198 | sz2 = atomic_read(&pi2->in_flight_size); | ||
199 | |||
200 | /* | ||
201 | * Case 1: Both have same throughput value. Choose less loaded path. | ||
202 | */ | ||
203 | if (pi1->relative_throughput == pi2->relative_throughput) | ||
204 | return sz1 - sz2; | ||
205 | |||
206 | /* | ||
207 | * Case 2a: Both have same load. Choose higher throughput path. | ||
208 | * Case 2b: One path has no throughput value. Choose the other one. | ||
209 | */ | ||
210 | if (sz1 == sz2 || | ||
211 | !pi1->relative_throughput || !pi2->relative_throughput) | ||
212 | return pi2->relative_throughput - pi1->relative_throughput; | ||
213 | |||
214 | /* | ||
215 | * Case 3: Calculate service time. Choose faster path. | ||
216 | * Service time using pi1: | ||
217 | * st1 = (sz1 + incoming) / pi1->relative_throughput | ||
218 | * Service time using pi2: | ||
219 | * st2 = (sz2 + incoming) / pi2->relative_throughput | ||
220 | * | ||
221 | * To avoid the division, transform the expression to use | ||
222 | * multiplication. | ||
223 | * Because ->relative_throughput > 0 here, if st1 < st2, | ||
224 | * the expressions below are the same meaning: | ||
225 | * (sz1 + incoming) / pi1->relative_throughput < | ||
226 | * (sz2 + incoming) / pi2->relative_throughput | ||
227 | * (sz1 + incoming) * pi2->relative_throughput < | ||
228 | * (sz2 + incoming) * pi1->relative_throughput | ||
229 | * So use the later one. | ||
230 | */ | ||
231 | sz1 += incoming; | ||
232 | sz2 += incoming; | ||
233 | if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE || | ||
234 | sz2 >= ST_MAX_INFLIGHT_SIZE)) { | ||
235 | /* | ||
236 | * Size may be too big for multiplying pi->relative_throughput | ||
237 | * and overflow. | ||
238 | * To avoid the overflow and mis-selection, shift down both. | ||
239 | */ | ||
240 | sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; | ||
241 | sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; | ||
242 | } | ||
243 | st1 = sz1 * pi2->relative_throughput; | ||
244 | st2 = sz2 * pi1->relative_throughput; | ||
245 | if (st1 != st2) | ||
246 | return st1 - st2; | ||
247 | |||
248 | /* | ||
249 | * Case 4: Service time is equal. Choose higher throughput path. | ||
250 | */ | ||
251 | return pi2->relative_throughput - pi1->relative_throughput; | ||
252 | } | ||
253 | |||
254 | static struct dm_path *st_select_path(struct path_selector *ps, | ||
255 | unsigned *repeat_count, size_t nr_bytes) | ||
256 | { | ||
257 | struct selector *s = ps->context; | ||
258 | struct path_info *pi = NULL, *best = NULL; | ||
259 | |||
260 | if (list_empty(&s->valid_paths)) | ||
261 | return NULL; | ||
262 | |||
263 | /* Change preferred (first in list) path to evenly balance. */ | ||
264 | list_move_tail(s->valid_paths.next, &s->valid_paths); | ||
265 | |||
266 | list_for_each_entry(pi, &s->valid_paths, list) | ||
267 | if (!best || (st_compare_load(pi, best, nr_bytes) < 0)) | ||
268 | best = pi; | ||
269 | |||
270 | if (!best) | ||
271 | return NULL; | ||
272 | |||
273 | *repeat_count = best->repeat_count; | ||
274 | |||
275 | return best->path; | ||
276 | } | ||
277 | |||
278 | static int st_start_io(struct path_selector *ps, struct dm_path *path, | ||
279 | size_t nr_bytes) | ||
280 | { | ||
281 | struct path_info *pi = path->pscontext; | ||
282 | |||
283 | atomic_add(nr_bytes, &pi->in_flight_size); | ||
284 | |||
285 | return 0; | ||
286 | } | ||
287 | |||
288 | static int st_end_io(struct path_selector *ps, struct dm_path *path, | ||
289 | size_t nr_bytes) | ||
290 | { | ||
291 | struct path_info *pi = path->pscontext; | ||
292 | |||
293 | atomic_sub(nr_bytes, &pi->in_flight_size); | ||
294 | |||
295 | return 0; | ||
296 | } | ||
297 | |||
298 | static struct path_selector_type st_ps = { | ||
299 | .name = "service-time", | ||
300 | .module = THIS_MODULE, | ||
301 | .table_args = 2, | ||
302 | .info_args = 2, | ||
303 | .create = st_create, | ||
304 | .destroy = st_destroy, | ||
305 | .status = st_status, | ||
306 | .add_path = st_add_path, | ||
307 | .fail_path = st_fail_path, | ||
308 | .reinstate_path = st_reinstate_path, | ||
309 | .select_path = st_select_path, | ||
310 | .start_io = st_start_io, | ||
311 | .end_io = st_end_io, | ||
312 | }; | ||
313 | |||
314 | static int __init dm_st_init(void) | ||
315 | { | ||
316 | int r = dm_register_path_selector(&st_ps); | ||
317 | |||
318 | if (r < 0) | ||
319 | DMERR("register failed %d", r); | ||
320 | |||
321 | DMINFO("version " ST_VERSION " loaded"); | ||
322 | |||
323 | return r; | ||
324 | } | ||
325 | |||
326 | static void __exit dm_st_exit(void) | ||
327 | { | ||
328 | int r = dm_unregister_path_selector(&st_ps); | ||
329 | |||
330 | if (r < 0) | ||
331 | DMERR("unregister failed %d", r); | ||
332 | } | ||
333 | |||
334 | module_init(dm_st_init); | ||
335 | module_exit(dm_st_exit); | ||
336 | |||
337 | MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector"); | ||
338 | MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>"); | ||
339 | MODULE_LICENSE("GPL"); | ||