diff options
| -rw-r--r-- | Documentation/device-mapper/dm-service-time.txt | 91 | ||||
| -rw-r--r-- | drivers/md/Kconfig | 10 | ||||
| -rw-r--r-- | drivers/md/Makefile | 1 | ||||
| -rw-r--r-- | drivers/md/dm-service-time.c | 339 |
4 files changed, 441 insertions, 0 deletions
diff --git a/Documentation/device-mapper/dm-service-time.txt b/Documentation/device-mapper/dm-service-time.txt new file mode 100644 index 000000000000..7d00668e97bb --- /dev/null +++ b/Documentation/device-mapper/dm-service-time.txt | |||
| @@ -0,0 +1,91 @@ | |||
| 1 | dm-service-time | ||
| 2 | =============== | ||
| 3 | |||
| 4 | dm-service-time is a path selector module for device-mapper targets, | ||
| 5 | which selects a path with the shortest estimated service time for | ||
| 6 | the incoming I/O. | ||
| 7 | |||
| 8 | The service time for each path is estimated by dividing the total size | ||
| 9 | of in-flight I/Os on a path with the performance value of the path. | ||
| 10 | The performance value is a relative throughput value among all paths | ||
| 11 | in a path-group, and it can be specified as a table argument. | ||
| 12 | |||
| 13 | The path selector name is 'service-time'. | ||
| 14 | |||
| 15 | Table parameters for each path: [<repeat_count> [<relative_throughput>]] | ||
| 16 | <repeat_count>: The number of I/Os to dispatch using the selected | ||
| 17 | path before switching to the next path. | ||
| 18 | If not given, internal default is used. To check | ||
| 19 | the default value, see the activated table. | ||
| 20 | <relative_throughput>: The relative throughput value of the path | ||
| 21 | among all paths in the path-group. | ||
| 22 | The valid range is 0-100. | ||
| 23 | If not given, minimum value '1' is used. | ||
| 24 | If '0' is given, the path isn't selected while | ||
| 25 | other paths having a positive value are available. | ||
| 26 | |||
| 27 | Status for each path: <status> <fail-count> <in-flight-size> \ | ||
| 28 | <relative_throughput> | ||
| 29 | <status>: 'A' if the path is active, 'F' if the path is failed. | ||
| 30 | <fail-count>: The number of path failures. | ||
| 31 | <in-flight-size>: The size of in-flight I/Os on the path. | ||
| 32 | <relative_throughput>: The relative throughput value of the path | ||
| 33 | among all paths in the path-group. | ||
| 34 | |||
| 35 | |||
| 36 | Algorithm | ||
| 37 | ========= | ||
| 38 | |||
| 39 | dm-service-time adds the I/O size to 'in-flight-size' when the I/O is | ||
| 40 | dispatched and substracts when completed. | ||
| 41 | Basically, dm-service-time selects a path having minimum service time | ||
| 42 | which is calculated by: | ||
| 43 | |||
| 44 | ('in-flight-size' + 'size-of-incoming-io') / 'relative_throughput' | ||
| 45 | |||
| 46 | However, some optimizations below are used to reduce the calculation | ||
| 47 | as much as possible. | ||
| 48 | |||
| 49 | 1. If the paths have the same 'relative_throughput', skip | ||
| 50 | the division and just compare the 'in-flight-size'. | ||
| 51 | |||
| 52 | 2. If the paths have the same 'in-flight-size', skip the division | ||
| 53 | and just compare the 'relative_throughput'. | ||
| 54 | |||
| 55 | 3. If some paths have non-zero 'relative_throughput' and others | ||
| 56 | have zero 'relative_throughput', ignore those paths with zero | ||
| 57 | 'relative_throughput'. | ||
| 58 | |||
| 59 | If such optimizations can't be applied, calculate service time, and | ||
| 60 | compare service time. | ||
| 61 | If calculated service time is equal, the path having maximum | ||
| 62 | 'relative_throughput' may be better. So compare 'relative_throughput' | ||
| 63 | then. | ||
| 64 | |||
| 65 | |||
| 66 | Examples | ||
| 67 | ======== | ||
| 68 | In case that 2 paths (sda and sdb) are used with repeat_count == 128 | ||
| 69 | and sda has an average throughput 1GB/s and sdb has 4GB/s, | ||
| 70 | 'relative_throughput' value may be '1' for sda and '4' for sdb. | ||
| 71 | |||
| 72 | # echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \ | ||
| 73 | dmsetup create test | ||
| 74 | # | ||
| 75 | # dmsetup table | ||
| 76 | test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4 | ||
| 77 | # | ||
| 78 | # dmsetup status | ||
| 79 | test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 1 8:16 A 0 0 4 | ||
| 80 | |||
| 81 | |||
| 82 | Or '2' for sda and '8' for sdb would be also true. | ||
| 83 | |||
| 84 | # echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8" \ | ||
| 85 | dmsetup create test | ||
| 86 | # | ||
| 87 | # dmsetup table | ||
| 88 | test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8 | ||
| 89 | # | ||
| 90 | # dmsetup status | ||
| 91 | test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 2 8:16 A 0 0 8 | ||
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 3b311d273346..09f93fa68912 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig | |||
| @@ -258,6 +258,16 @@ config DM_MULTIPATH_QL | |||
| 258 | 258 | ||
| 259 | If unsure, say N. | 259 | If unsure, say N. |
| 260 | 260 | ||
| 261 | config DM_MULTIPATH_ST | ||
| 262 | tristate "I/O Path Selector based on the service time" | ||
| 263 | depends on DM_MULTIPATH | ||
| 264 | ---help--- | ||
| 265 | This path selector is a dynamic load balancer which selects | ||
| 266 | the path expected to complete the incoming I/O in the shortest | ||
| 267 | time. | ||
| 268 | |||
| 269 | If unsure, say N. | ||
| 270 | |||
| 261 | config DM_DELAY | 271 | config DM_DELAY |
| 262 | tristate "I/O delaying target (EXPERIMENTAL)" | 272 | tristate "I/O delaying target (EXPERIMENTAL)" |
| 263 | depends on BLK_DEV_DM && EXPERIMENTAL | 273 | depends on BLK_DEV_DM && EXPERIMENTAL |
diff --git a/drivers/md/Makefile b/drivers/md/Makefile index ff9f545dd516..dade52f60733 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile | |||
| @@ -37,6 +37,7 @@ obj-$(CONFIG_DM_CRYPT) += dm-crypt.o | |||
| 37 | obj-$(CONFIG_DM_DELAY) += dm-delay.o | 37 | obj-$(CONFIG_DM_DELAY) += dm-delay.o |
| 38 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o | 38 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o |
| 39 | obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o | 39 | obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o |
| 40 | obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o | ||
| 40 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o | 41 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o |
| 41 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o | 42 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o |
| 42 | obj-$(CONFIG_DM_ZERO) += dm-zero.o | 43 | obj-$(CONFIG_DM_ZERO) += dm-zero.o |
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c new file mode 100644 index 000000000000..cfa668f46c40 --- /dev/null +++ b/drivers/md/dm-service-time.c | |||
| @@ -0,0 +1,339 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved. | ||
| 3 | * | ||
| 4 | * Module Author: Kiyoshi Ueda | ||
| 5 | * | ||
| 6 | * This file is released under the GPL. | ||
| 7 | * | ||
| 8 | * Throughput oriented path selector. | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include "dm.h" | ||
| 12 | #include "dm-path-selector.h" | ||
| 13 | |||
| 14 | #define DM_MSG_PREFIX "multipath service-time" | ||
| 15 | #define ST_MIN_IO 1 | ||
| 16 | #define ST_MAX_RELATIVE_THROUGHPUT 100 | ||
| 17 | #define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7 | ||
| 18 | #define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT) | ||
| 19 | #define ST_VERSION "0.2.0" | ||
| 20 | |||
| 21 | struct selector { | ||
| 22 | struct list_head valid_paths; | ||
| 23 | struct list_head failed_paths; | ||
| 24 | }; | ||
| 25 | |||
| 26 | struct path_info { | ||
| 27 | struct list_head list; | ||
| 28 | struct dm_path *path; | ||
| 29 | unsigned repeat_count; | ||
| 30 | unsigned relative_throughput; | ||
| 31 | atomic_t in_flight_size; /* Total size of in-flight I/Os */ | ||
| 32 | }; | ||
| 33 | |||
| 34 | static struct selector *alloc_selector(void) | ||
| 35 | { | ||
| 36 | struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
| 37 | |||
| 38 | if (s) { | ||
| 39 | INIT_LIST_HEAD(&s->valid_paths); | ||
| 40 | INIT_LIST_HEAD(&s->failed_paths); | ||
| 41 | } | ||
| 42 | |||
| 43 | return s; | ||
| 44 | } | ||
| 45 | |||
| 46 | static int st_create(struct path_selector *ps, unsigned argc, char **argv) | ||
| 47 | { | ||
| 48 | struct selector *s = alloc_selector(); | ||
| 49 | |||
| 50 | if (!s) | ||
| 51 | return -ENOMEM; | ||
| 52 | |||
| 53 | ps->context = s; | ||
| 54 | return 0; | ||
| 55 | } | ||
| 56 | |||
| 57 | static void free_paths(struct list_head *paths) | ||
| 58 | { | ||
| 59 | struct path_info *pi, *next; | ||
| 60 | |||
| 61 | list_for_each_entry_safe(pi, next, paths, list) { | ||
| 62 | list_del(&pi->list); | ||
| 63 | kfree(pi); | ||
| 64 | } | ||
| 65 | } | ||
| 66 | |||
| 67 | static void st_destroy(struct path_selector *ps) | ||
| 68 | { | ||
| 69 | struct selector *s = ps->context; | ||
| 70 | |||
| 71 | free_paths(&s->valid_paths); | ||
| 72 | free_paths(&s->failed_paths); | ||
| 73 | kfree(s); | ||
| 74 | ps->context = NULL; | ||
| 75 | } | ||
| 76 | |||
| 77 | static int st_status(struct path_selector *ps, struct dm_path *path, | ||
| 78 | status_type_t type, char *result, unsigned maxlen) | ||
| 79 | { | ||
| 80 | unsigned sz = 0; | ||
| 81 | struct path_info *pi; | ||
| 82 | |||
| 83 | if (!path) | ||
| 84 | DMEMIT("0 "); | ||
| 85 | else { | ||
| 86 | pi = path->pscontext; | ||
| 87 | |||
| 88 | switch (type) { | ||
| 89 | case STATUSTYPE_INFO: | ||
| 90 | DMEMIT("%d %u ", atomic_read(&pi->in_flight_size), | ||
| 91 | pi->relative_throughput); | ||
| 92 | break; | ||
| 93 | case STATUSTYPE_TABLE: | ||
| 94 | DMEMIT("%u %u ", pi->repeat_count, | ||
| 95 | pi->relative_throughput); | ||
| 96 | break; | ||
| 97 | } | ||
| 98 | } | ||
| 99 | |||
| 100 | return sz; | ||
| 101 | } | ||
| 102 | |||
| 103 | static int st_add_path(struct path_selector *ps, struct dm_path *path, | ||
| 104 | int argc, char **argv, char **error) | ||
| 105 | { | ||
| 106 | struct selector *s = ps->context; | ||
| 107 | struct path_info *pi; | ||
| 108 | unsigned repeat_count = ST_MIN_IO; | ||
| 109 | unsigned relative_throughput = 1; | ||
| 110 | |||
| 111 | /* | ||
| 112 | * Arguments: [<repeat_count> [<relative_throughput>]] | ||
| 113 | * <repeat_count>: The number of I/Os before switching path. | ||
| 114 | * If not given, default (ST_MIN_IO) is used. | ||
| 115 | * <relative_throughput>: The relative throughput value of | ||
| 116 | * the path among all paths in the path-group. | ||
| 117 | * The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT> | ||
| 118 | * If not given, minimum value '1' is used. | ||
| 119 | * If '0' is given, the path isn't selected while | ||
| 120 | * other paths having a positive value are | ||
| 121 | * available. | ||
| 122 | */ | ||
| 123 | if (argc > 2) { | ||
| 124 | *error = "service-time ps: incorrect number of arguments"; | ||
| 125 | return -EINVAL; | ||
| 126 | } | ||
| 127 | |||
| 128 | if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | ||
| 129 | *error = "service-time ps: invalid repeat count"; | ||
| 130 | return -EINVAL; | ||
| 131 | } | ||
| 132 | |||
| 133 | if ((argc == 2) && | ||
| 134 | (sscanf(argv[1], "%u", &relative_throughput) != 1 || | ||
| 135 | relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { | ||
| 136 | *error = "service-time ps: invalid relative_throughput value"; | ||
| 137 | return -EINVAL; | ||
| 138 | } | ||
| 139 | |||
| 140 | /* allocate the path */ | ||
| 141 | pi = kmalloc(sizeof(*pi), GFP_KERNEL); | ||
| 142 | if (!pi) { | ||
| 143 | *error = "service-time ps: Error allocating path context"; | ||
| 144 | return -ENOMEM; | ||
| 145 | } | ||
| 146 | |||
| 147 | pi->path = path; | ||
| 148 | pi->repeat_count = repeat_count; | ||
| 149 | pi->relative_throughput = relative_throughput; | ||
| 150 | atomic_set(&pi->in_flight_size, 0); | ||
| 151 | |||
| 152 | path->pscontext = pi; | ||
| 153 | |||
| 154 | list_add_tail(&pi->list, &s->valid_paths); | ||
| 155 | |||
| 156 | return 0; | ||
| 157 | } | ||
| 158 | |||
| 159 | static void st_fail_path(struct path_selector *ps, struct dm_path *path) | ||
| 160 | { | ||
| 161 | struct selector *s = ps->context; | ||
| 162 | struct path_info *pi = path->pscontext; | ||
| 163 | |||
| 164 | list_move(&pi->list, &s->failed_paths); | ||
| 165 | } | ||
| 166 | |||
| 167 | static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) | ||
| 168 | { | ||
| 169 | struct selector *s = ps->context; | ||
| 170 | struct path_info *pi = path->pscontext; | ||
| 171 | |||
| 172 | list_move_tail(&pi->list, &s->valid_paths); | ||
| 173 | |||
| 174 | return 0; | ||
| 175 | } | ||
| 176 | |||
| 177 | /* | ||
| 178 | * Compare the estimated service time of 2 paths, pi1 and pi2, | ||
| 179 | * for the incoming I/O. | ||
| 180 | * | ||
| 181 | * Returns: | ||
| 182 | * < 0 : pi1 is better | ||
| 183 | * 0 : no difference between pi1 and pi2 | ||
| 184 | * > 0 : pi2 is better | ||
| 185 | * | ||
| 186 | * Description: | ||
| 187 | * Basically, the service time is estimated by: | ||
| 188 | * ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput' | ||
| 189 | * To reduce the calculation, some optimizations are made. | ||
| 190 | * (See comments inline) | ||
| 191 | */ | ||
| 192 | static int st_compare_load(struct path_info *pi1, struct path_info *pi2, | ||
| 193 | size_t incoming) | ||
| 194 | { | ||
| 195 | size_t sz1, sz2, st1, st2; | ||
| 196 | |||
| 197 | sz1 = atomic_read(&pi1->in_flight_size); | ||
| 198 | sz2 = atomic_read(&pi2->in_flight_size); | ||
| 199 | |||
| 200 | /* | ||
| 201 | * Case 1: Both have same throughput value. Choose less loaded path. | ||
| 202 | */ | ||
| 203 | if (pi1->relative_throughput == pi2->relative_throughput) | ||
| 204 | return sz1 - sz2; | ||
| 205 | |||
| 206 | /* | ||
| 207 | * Case 2a: Both have same load. Choose higher throughput path. | ||
| 208 | * Case 2b: One path has no throughput value. Choose the other one. | ||
| 209 | */ | ||
| 210 | if (sz1 == sz2 || | ||
| 211 | !pi1->relative_throughput || !pi2->relative_throughput) | ||
| 212 | return pi2->relative_throughput - pi1->relative_throughput; | ||
| 213 | |||
| 214 | /* | ||
| 215 | * Case 3: Calculate service time. Choose faster path. | ||
| 216 | * Service time using pi1: | ||
| 217 | * st1 = (sz1 + incoming) / pi1->relative_throughput | ||
| 218 | * Service time using pi2: | ||
| 219 | * st2 = (sz2 + incoming) / pi2->relative_throughput | ||
| 220 | * | ||
| 221 | * To avoid the division, transform the expression to use | ||
| 222 | * multiplication. | ||
| 223 | * Because ->relative_throughput > 0 here, if st1 < st2, | ||
| 224 | * the expressions below are the same meaning: | ||
| 225 | * (sz1 + incoming) / pi1->relative_throughput < | ||
| 226 | * (sz2 + incoming) / pi2->relative_throughput | ||
| 227 | * (sz1 + incoming) * pi2->relative_throughput < | ||
| 228 | * (sz2 + incoming) * pi1->relative_throughput | ||
| 229 | * So use the later one. | ||
| 230 | */ | ||
| 231 | sz1 += incoming; | ||
| 232 | sz2 += incoming; | ||
| 233 | if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE || | ||
| 234 | sz2 >= ST_MAX_INFLIGHT_SIZE)) { | ||
| 235 | /* | ||
| 236 | * Size may be too big for multiplying pi->relative_throughput | ||
| 237 | * and overflow. | ||
| 238 | * To avoid the overflow and mis-selection, shift down both. | ||
| 239 | */ | ||
| 240 | sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; | ||
| 241 | sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; | ||
| 242 | } | ||
| 243 | st1 = sz1 * pi2->relative_throughput; | ||
| 244 | st2 = sz2 * pi1->relative_throughput; | ||
| 245 | if (st1 != st2) | ||
| 246 | return st1 - st2; | ||
| 247 | |||
| 248 | /* | ||
| 249 | * Case 4: Service time is equal. Choose higher throughput path. | ||
| 250 | */ | ||
| 251 | return pi2->relative_throughput - pi1->relative_throughput; | ||
| 252 | } | ||
| 253 | |||
| 254 | static struct dm_path *st_select_path(struct path_selector *ps, | ||
| 255 | unsigned *repeat_count, size_t nr_bytes) | ||
| 256 | { | ||
| 257 | struct selector *s = ps->context; | ||
| 258 | struct path_info *pi = NULL, *best = NULL; | ||
| 259 | |||
| 260 | if (list_empty(&s->valid_paths)) | ||
| 261 | return NULL; | ||
| 262 | |||
| 263 | /* Change preferred (first in list) path to evenly balance. */ | ||
| 264 | list_move_tail(s->valid_paths.next, &s->valid_paths); | ||
| 265 | |||
| 266 | list_for_each_entry(pi, &s->valid_paths, list) | ||
| 267 | if (!best || (st_compare_load(pi, best, nr_bytes) < 0)) | ||
| 268 | best = pi; | ||
| 269 | |||
| 270 | if (!best) | ||
| 271 | return NULL; | ||
| 272 | |||
| 273 | *repeat_count = best->repeat_count; | ||
| 274 | |||
| 275 | return best->path; | ||
| 276 | } | ||
| 277 | |||
| 278 | static int st_start_io(struct path_selector *ps, struct dm_path *path, | ||
| 279 | size_t nr_bytes) | ||
| 280 | { | ||
| 281 | struct path_info *pi = path->pscontext; | ||
| 282 | |||
| 283 | atomic_add(nr_bytes, &pi->in_flight_size); | ||
| 284 | |||
| 285 | return 0; | ||
| 286 | } | ||
| 287 | |||
| 288 | static int st_end_io(struct path_selector *ps, struct dm_path *path, | ||
| 289 | size_t nr_bytes) | ||
| 290 | { | ||
| 291 | struct path_info *pi = path->pscontext; | ||
| 292 | |||
| 293 | atomic_sub(nr_bytes, &pi->in_flight_size); | ||
| 294 | |||
| 295 | return 0; | ||
| 296 | } | ||
| 297 | |||
| 298 | static struct path_selector_type st_ps = { | ||
| 299 | .name = "service-time", | ||
| 300 | .module = THIS_MODULE, | ||
| 301 | .table_args = 2, | ||
| 302 | .info_args = 2, | ||
| 303 | .create = st_create, | ||
| 304 | .destroy = st_destroy, | ||
| 305 | .status = st_status, | ||
| 306 | .add_path = st_add_path, | ||
| 307 | .fail_path = st_fail_path, | ||
| 308 | .reinstate_path = st_reinstate_path, | ||
| 309 | .select_path = st_select_path, | ||
| 310 | .start_io = st_start_io, | ||
| 311 | .end_io = st_end_io, | ||
| 312 | }; | ||
| 313 | |||
| 314 | static int __init dm_st_init(void) | ||
| 315 | { | ||
| 316 | int r = dm_register_path_selector(&st_ps); | ||
| 317 | |||
| 318 | if (r < 0) | ||
| 319 | DMERR("register failed %d", r); | ||
| 320 | |||
| 321 | DMINFO("version " ST_VERSION " loaded"); | ||
| 322 | |||
| 323 | return r; | ||
| 324 | } | ||
| 325 | |||
| 326 | static void __exit dm_st_exit(void) | ||
| 327 | { | ||
| 328 | int r = dm_unregister_path_selector(&st_ps); | ||
| 329 | |||
| 330 | if (r < 0) | ||
| 331 | DMERR("unregister failed %d", r); | ||
| 332 | } | ||
| 333 | |||
| 334 | module_init(dm_st_init); | ||
| 335 | module_exit(dm_st_exit); | ||
| 336 | |||
| 337 | MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector"); | ||
| 338 | MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>"); | ||
| 339 | MODULE_LICENSE("GPL"); | ||
