aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKiyoshi Ueda <k-ueda@ct.jp.nec.com>2009-06-22 05:12:28 -0400
committerAlasdair G Kergon <agk@redhat.com>2009-06-22 05:12:28 -0400
commitf392ba889b019602976082bfe7bf486c2594f85c (patch)
tree962e8f354dfe3df2021476412be8d1bcec8a03d0
parentfd5e033908b7b743b5650790f196761dd930f988 (diff)
dm mpath: add service time load balancer
This patch adds a service time oriented dynamic load balancer, dm-service-time, which selects the path with the shortest estimated service time for the incoming I/O. The service time is estimated by dividing the in-flight I/O size by a performance value of each path. The performance value can be given as a table argument at the table loading time. If no performance value is given, all paths are considered equal. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
-rw-r--r--Documentation/device-mapper/dm-service-time.txt91
-rw-r--r--drivers/md/Kconfig10
-rw-r--r--drivers/md/Makefile1
-rw-r--r--drivers/md/dm-service-time.c339
4 files changed, 441 insertions, 0 deletions
diff --git a/Documentation/device-mapper/dm-service-time.txt b/Documentation/device-mapper/dm-service-time.txt
new file mode 100644
index 000000000000..7d00668e97bb
--- /dev/null
+++ b/Documentation/device-mapper/dm-service-time.txt
@@ -0,0 +1,91 @@
1dm-service-time
2===============
3
4dm-service-time is a path selector module for device-mapper targets,
5which selects a path with the shortest estimated service time for
6the incoming I/O.
7
8The service time for each path is estimated by dividing the total size
9of in-flight I/Os on a path with the performance value of the path.
10The performance value is a relative throughput value among all paths
11in a path-group, and it can be specified as a table argument.
12
13The path selector name is 'service-time'.
14
15Table parameters for each path: [<repeat_count> [<relative_throughput>]]
16 <repeat_count>: The number of I/Os to dispatch using the selected
17 path before switching to the next path.
18 If not given, internal default is used. To check
19 the default value, see the activated table.
20 <relative_throughput>: The relative throughput value of the path
21 among all paths in the path-group.
22 The valid range is 0-100.
23 If not given, minimum value '1' is used.
24 If '0' is given, the path isn't selected while
25 other paths having a positive value are available.
26
27Status for each path: <status> <fail-count> <in-flight-size> \
28 <relative_throughput>
29 <status>: 'A' if the path is active, 'F' if the path is failed.
30 <fail-count>: The number of path failures.
31 <in-flight-size>: The size of in-flight I/Os on the path.
32 <relative_throughput>: The relative throughput value of the path
33 among all paths in the path-group.
34
35
36Algorithm
37=========
38
39dm-service-time adds the I/O size to 'in-flight-size' when the I/O is
40dispatched and substracts when completed.
41Basically, dm-service-time selects a path having minimum service time
42which is calculated by:
43
44 ('in-flight-size' + 'size-of-incoming-io') / 'relative_throughput'
45
46However, some optimizations below are used to reduce the calculation
47as much as possible.
48
49 1. If the paths have the same 'relative_throughput', skip
50 the division and just compare the 'in-flight-size'.
51
52 2. If the paths have the same 'in-flight-size', skip the division
53 and just compare the 'relative_throughput'.
54
55 3. If some paths have non-zero 'relative_throughput' and others
56 have zero 'relative_throughput', ignore those paths with zero
57 'relative_throughput'.
58
59If such optimizations can't be applied, calculate service time, and
60compare service time.
61If calculated service time is equal, the path having maximum
62'relative_throughput' may be better. So compare 'relative_throughput'
63then.
64
65
66Examples
67========
68In case that 2 paths (sda and sdb) are used with repeat_count == 128
69and sda has an average throughput 1GB/s and sdb has 4GB/s,
70'relative_throughput' value may be '1' for sda and '4' for sdb.
71
72# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \
73 dmsetup create test
74#
75# dmsetup table
76test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4
77#
78# dmsetup status
79test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 1 8:16 A 0 0 4
80
81
82Or '2' for sda and '8' for sdb would be also true.
83
84# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8" \
85 dmsetup create test
86#
87# dmsetup table
88test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8
89#
90# dmsetup status
91test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 2 8:16 A 0 0 8
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 3b311d273346..09f93fa68912 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -258,6 +258,16 @@ config DM_MULTIPATH_QL
258 258
259 If unsure, say N. 259 If unsure, say N.
260 260
261config DM_MULTIPATH_ST
262 tristate "I/O Path Selector based on the service time"
263 depends on DM_MULTIPATH
264 ---help---
265 This path selector is a dynamic load balancer which selects
266 the path expected to complete the incoming I/O in the shortest
267 time.
268
269 If unsure, say N.
270
261config DM_DELAY 271config DM_DELAY
262 tristate "I/O delaying target (EXPERIMENTAL)" 272 tristate "I/O delaying target (EXPERIMENTAL)"
263 depends on BLK_DEV_DM && EXPERIMENTAL 273 depends on BLK_DEV_DM && EXPERIMENTAL
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index ff9f545dd516..dade52f60733 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
37obj-$(CONFIG_DM_DELAY) += dm-delay.o 37obj-$(CONFIG_DM_DELAY) += dm-delay.o
38obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o 38obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
39obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o 39obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
40obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o
40obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o 41obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
41obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o 42obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
42obj-$(CONFIG_DM_ZERO) += dm-zero.o 43obj-$(CONFIG_DM_ZERO) += dm-zero.o
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
new file mode 100644
index 000000000000..cfa668f46c40
--- /dev/null
+++ b/drivers/md/dm-service-time.c
@@ -0,0 +1,339 @@
1/*
2 * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved.
3 *
4 * Module Author: Kiyoshi Ueda
5 *
6 * This file is released under the GPL.
7 *
8 * Throughput oriented path selector.
9 */
10
11#include "dm.h"
12#include "dm-path-selector.h"
13
14#define DM_MSG_PREFIX "multipath service-time"
15#define ST_MIN_IO 1
16#define ST_MAX_RELATIVE_THROUGHPUT 100
17#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7
18#define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)
19#define ST_VERSION "0.2.0"
20
21struct selector {
22 struct list_head valid_paths;
23 struct list_head failed_paths;
24};
25
26struct path_info {
27 struct list_head list;
28 struct dm_path *path;
29 unsigned repeat_count;
30 unsigned relative_throughput;
31 atomic_t in_flight_size; /* Total size of in-flight I/Os */
32};
33
34static struct selector *alloc_selector(void)
35{
36 struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
37
38 if (s) {
39 INIT_LIST_HEAD(&s->valid_paths);
40 INIT_LIST_HEAD(&s->failed_paths);
41 }
42
43 return s;
44}
45
46static int st_create(struct path_selector *ps, unsigned argc, char **argv)
47{
48 struct selector *s = alloc_selector();
49
50 if (!s)
51 return -ENOMEM;
52
53 ps->context = s;
54 return 0;
55}
56
57static void free_paths(struct list_head *paths)
58{
59 struct path_info *pi, *next;
60
61 list_for_each_entry_safe(pi, next, paths, list) {
62 list_del(&pi->list);
63 kfree(pi);
64 }
65}
66
67static void st_destroy(struct path_selector *ps)
68{
69 struct selector *s = ps->context;
70
71 free_paths(&s->valid_paths);
72 free_paths(&s->failed_paths);
73 kfree(s);
74 ps->context = NULL;
75}
76
77static int st_status(struct path_selector *ps, struct dm_path *path,
78 status_type_t type, char *result, unsigned maxlen)
79{
80 unsigned sz = 0;
81 struct path_info *pi;
82
83 if (!path)
84 DMEMIT("0 ");
85 else {
86 pi = path->pscontext;
87
88 switch (type) {
89 case STATUSTYPE_INFO:
90 DMEMIT("%d %u ", atomic_read(&pi->in_flight_size),
91 pi->relative_throughput);
92 break;
93 case STATUSTYPE_TABLE:
94 DMEMIT("%u %u ", pi->repeat_count,
95 pi->relative_throughput);
96 break;
97 }
98 }
99
100 return sz;
101}
102
103static int st_add_path(struct path_selector *ps, struct dm_path *path,
104 int argc, char **argv, char **error)
105{
106 struct selector *s = ps->context;
107 struct path_info *pi;
108 unsigned repeat_count = ST_MIN_IO;
109 unsigned relative_throughput = 1;
110
111 /*
112 * Arguments: [<repeat_count> [<relative_throughput>]]
113 * <repeat_count>: The number of I/Os before switching path.
114 * If not given, default (ST_MIN_IO) is used.
115 * <relative_throughput>: The relative throughput value of
116 * the path among all paths in the path-group.
117 * The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT>
118 * If not given, minimum value '1' is used.
119 * If '0' is given, the path isn't selected while
120 * other paths having a positive value are
121 * available.
122 */
123 if (argc > 2) {
124 *error = "service-time ps: incorrect number of arguments";
125 return -EINVAL;
126 }
127
128 if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
129 *error = "service-time ps: invalid repeat count";
130 return -EINVAL;
131 }
132
133 if ((argc == 2) &&
134 (sscanf(argv[1], "%u", &relative_throughput) != 1 ||
135 relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
136 *error = "service-time ps: invalid relative_throughput value";
137 return -EINVAL;
138 }
139
140 /* allocate the path */
141 pi = kmalloc(sizeof(*pi), GFP_KERNEL);
142 if (!pi) {
143 *error = "service-time ps: Error allocating path context";
144 return -ENOMEM;
145 }
146
147 pi->path = path;
148 pi->repeat_count = repeat_count;
149 pi->relative_throughput = relative_throughput;
150 atomic_set(&pi->in_flight_size, 0);
151
152 path->pscontext = pi;
153
154 list_add_tail(&pi->list, &s->valid_paths);
155
156 return 0;
157}
158
159static void st_fail_path(struct path_selector *ps, struct dm_path *path)
160{
161 struct selector *s = ps->context;
162 struct path_info *pi = path->pscontext;
163
164 list_move(&pi->list, &s->failed_paths);
165}
166
167static int st_reinstate_path(struct path_selector *ps, struct dm_path *path)
168{
169 struct selector *s = ps->context;
170 struct path_info *pi = path->pscontext;
171
172 list_move_tail(&pi->list, &s->valid_paths);
173
174 return 0;
175}
176
177/*
178 * Compare the estimated service time of 2 paths, pi1 and pi2,
179 * for the incoming I/O.
180 *
181 * Returns:
182 * < 0 : pi1 is better
183 * 0 : no difference between pi1 and pi2
184 * > 0 : pi2 is better
185 *
186 * Description:
187 * Basically, the service time is estimated by:
188 * ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput'
189 * To reduce the calculation, some optimizations are made.
190 * (See comments inline)
191 */
192static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
193 size_t incoming)
194{
195 size_t sz1, sz2, st1, st2;
196
197 sz1 = atomic_read(&pi1->in_flight_size);
198 sz2 = atomic_read(&pi2->in_flight_size);
199
200 /*
201 * Case 1: Both have same throughput value. Choose less loaded path.
202 */
203 if (pi1->relative_throughput == pi2->relative_throughput)
204 return sz1 - sz2;
205
206 /*
207 * Case 2a: Both have same load. Choose higher throughput path.
208 * Case 2b: One path has no throughput value. Choose the other one.
209 */
210 if (sz1 == sz2 ||
211 !pi1->relative_throughput || !pi2->relative_throughput)
212 return pi2->relative_throughput - pi1->relative_throughput;
213
214 /*
215 * Case 3: Calculate service time. Choose faster path.
216 * Service time using pi1:
217 * st1 = (sz1 + incoming) / pi1->relative_throughput
218 * Service time using pi2:
219 * st2 = (sz2 + incoming) / pi2->relative_throughput
220 *
221 * To avoid the division, transform the expression to use
222 * multiplication.
223 * Because ->relative_throughput > 0 here, if st1 < st2,
224 * the expressions below are the same meaning:
225 * (sz1 + incoming) / pi1->relative_throughput <
226 * (sz2 + incoming) / pi2->relative_throughput
227 * (sz1 + incoming) * pi2->relative_throughput <
228 * (sz2 + incoming) * pi1->relative_throughput
229 * So use the later one.
230 */
231 sz1 += incoming;
232 sz2 += incoming;
233 if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
234 sz2 >= ST_MAX_INFLIGHT_SIZE)) {
235 /*
236 * Size may be too big for multiplying pi->relative_throughput
237 * and overflow.
238 * To avoid the overflow and mis-selection, shift down both.
239 */
240 sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
241 sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
242 }
243 st1 = sz1 * pi2->relative_throughput;
244 st2 = sz2 * pi1->relative_throughput;
245 if (st1 != st2)
246 return st1 - st2;
247
248 /*
249 * Case 4: Service time is equal. Choose higher throughput path.
250 */
251 return pi2->relative_throughput - pi1->relative_throughput;
252}
253
254static struct dm_path *st_select_path(struct path_selector *ps,
255 unsigned *repeat_count, size_t nr_bytes)
256{
257 struct selector *s = ps->context;
258 struct path_info *pi = NULL, *best = NULL;
259
260 if (list_empty(&s->valid_paths))
261 return NULL;
262
263 /* Change preferred (first in list) path to evenly balance. */
264 list_move_tail(s->valid_paths.next, &s->valid_paths);
265
266 list_for_each_entry(pi, &s->valid_paths, list)
267 if (!best || (st_compare_load(pi, best, nr_bytes) < 0))
268 best = pi;
269
270 if (!best)
271 return NULL;
272
273 *repeat_count = best->repeat_count;
274
275 return best->path;
276}
277
278static int st_start_io(struct path_selector *ps, struct dm_path *path,
279 size_t nr_bytes)
280{
281 struct path_info *pi = path->pscontext;
282
283 atomic_add(nr_bytes, &pi->in_flight_size);
284
285 return 0;
286}
287
288static int st_end_io(struct path_selector *ps, struct dm_path *path,
289 size_t nr_bytes)
290{
291 struct path_info *pi = path->pscontext;
292
293 atomic_sub(nr_bytes, &pi->in_flight_size);
294
295 return 0;
296}
297
298static struct path_selector_type st_ps = {
299 .name = "service-time",
300 .module = THIS_MODULE,
301 .table_args = 2,
302 .info_args = 2,
303 .create = st_create,
304 .destroy = st_destroy,
305 .status = st_status,
306 .add_path = st_add_path,
307 .fail_path = st_fail_path,
308 .reinstate_path = st_reinstate_path,
309 .select_path = st_select_path,
310 .start_io = st_start_io,
311 .end_io = st_end_io,
312};
313
314static int __init dm_st_init(void)
315{
316 int r = dm_register_path_selector(&st_ps);
317
318 if (r < 0)
319 DMERR("register failed %d", r);
320
321 DMINFO("version " ST_VERSION " loaded");
322
323 return r;
324}
325
326static void __exit dm_st_exit(void)
327{
328 int r = dm_unregister_path_selector(&st_ps);
329
330 if (r < 0)
331 DMERR("unregister failed %d", r);
332}
333
334module_init(dm_st_init);
335module_exit(dm_st_exit);
336
337MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector");
338MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>");
339MODULE_LICENSE("GPL");