summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIlya Dryomov <idryomov@gmail.com>2017-06-22 13:44:05 -0400
committerIlya Dryomov <idryomov@gmail.com>2017-07-07 11:25:19 -0400
commit069f3222ca96acfe8c59937e98c401bda5475b48 (patch)
tree28db93df7b122265120acee62403a412295022bd
parent1c2e7b451b889bead46cef410a737d1767cd6f0b (diff)
crush: implement weight and id overrides for straw2
bucket_straw2_choose needs to use weights that may be different from weight_items. For instance to compensate for an uneven distribution caused by a low number of values. Or to fix the probability biais introduced by conditional probabilities (see http://tracker.ceph.com/issues/15653 for more information). We introduce a weight_set for each straw2 bucket to set the desired weight for a given item at a given position. The weight of a given item when picking the first replica (first position) may be different from the weight the second replica (second position). For instance the weight matrix for a given bucket containing items 3, 7 and 13 could be as follows: position 0 position 1 item 3 0x10000 0x100000 item 7 0x40000 0x10000 item 13 0x40000 0x10000 When crush_do_rule picks the first of two replicas (position 0), item 7, 3 are four times more likely to be choosen by bucket_straw2_choose than item 13. When choosing the second replica (position 1), item 3 is ten times more likely to be choosen than item 7, 13. By default the weight_set of each bucket exactly matches the content of item_weights for each position to ensure backward compatibility. bucket_straw2_choose compares items by using their id. The same ids are also used to index buckets and they must be unique. For each item in a bucket an array of ids can be provided for placement purposes and they are used instead of the ids. If no replacement ids are provided, the legacy behavior is preserved. Reflects ceph.git commit 19537a450fd5c5a0bb8b7830947507a76db2ceca. Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
-rw-r--r--include/linux/crush/crush.h58
-rw-r--r--include/linux/crush/mapper.h9
-rw-r--r--net/ceph/crush/mapper.c74
-rw-r--r--net/ceph/osdmap.c2
4 files changed, 119 insertions, 24 deletions
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index fbecbd089d75..d8676e56fa23 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -137,6 +137,64 @@ struct crush_bucket {
137 137
138}; 138};
139 139
140/** @ingroup API
141 *
142 * Replacement weights for each item in a bucket. The size of the
143 * array must be exactly the size of the straw2 bucket, just as the
144 * item_weights array.
145 *
146 */
147struct crush_weight_set {
148 __u32 *weights; /*!< 16.16 fixed point weights
149 in the same order as items */
150 __u32 size; /*!< size of the __weights__ array */
151};
152
153/** @ingroup API
154 *
155 * Replacement weights and ids for a given straw2 bucket, for
156 * placement purposes.
157 *
158 * When crush_do_rule() chooses the Nth item from a straw2 bucket, the
159 * replacement weights found at __weight_set[N]__ are used instead of
160 * the weights from __item_weights__. If __N__ is greater than
161 * __weight_set_size__, the weights found at __weight_set_size-1__ are
162 * used instead. For instance if __weight_set__ is:
163 *
164 * [ [ 0x10000, 0x20000 ], // position 0
165 * [ 0x20000, 0x40000 ] ] // position 1
166 *
167 * choosing the 0th item will use position 0 weights [ 0x10000, 0x20000 ]
168 * choosing the 1th item will use position 1 weights [ 0x20000, 0x40000 ]
169 * choosing the 2th item will use position 1 weights [ 0x20000, 0x40000 ]
170 * etc.
171 *
172 */
173struct crush_choose_arg {
174 __s32 *ids; /*!< values to use instead of items */
175 __u32 ids_size; /*!< size of the __ids__ array */
176 struct crush_weight_set *weight_set; /*!< weight replacements for
177 a given position */
178 __u32 weight_set_size; /*!< size of the __weight_set__ array */
179};
180
181/** @ingroup API
182 *
183 * Replacement weights and ids for each bucket in the crushmap. The
184 * __size__ of the __args__ array must be exactly the same as the
185 * __map->max_buckets__.
186 *
187 * The __crush_choose_arg__ at index N will be used when choosing
188 * an item from the bucket __map->buckets[N]__ bucket, provided it
189 * is a straw2 bucket.
190 *
191 */
192struct crush_choose_arg_map {
193 struct crush_choose_arg *args; /*!< replacement for each bucket
194 in the crushmap */
195 __u32 size; /*!< size of the __args__ array */
196};
197
140struct crush_bucket_uniform { 198struct crush_bucket_uniform {
141 struct crush_bucket h; 199 struct crush_bucket h;
142 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */ 200 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h
index c95e19e1ff11..141edabb947e 100644
--- a/include/linux/crush/mapper.h
+++ b/include/linux/crush/mapper.h
@@ -11,11 +11,10 @@
11#include "crush.h" 11#include "crush.h"
12 12
13extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size); 13extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size);
14extern int crush_do_rule(const struct crush_map *map, 14int crush_do_rule(const struct crush_map *map,
15 int ruleno, 15 int ruleno, int x, int *result, int result_max,
16 int x, int *result, int result_max, 16 const __u32 *weight, int weight_max,
17 const __u32 *weights, int weight_max, 17 void *cwin, const struct crush_choose_arg *choose_args);
18 void *cwin);
19 18
20/* 19/*
21 * Returns the exact amount of workspace that will need to be used 20 * Returns the exact amount of workspace that will need to be used
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index b5cd8c21bfdf..0b2646a9cc50 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -302,19 +302,42 @@ static __u64 crush_ln(unsigned int xin)
302 * 302 *
303 */ 303 */
304 304
305static __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket,
306 const struct crush_choose_arg *arg,
307 int position)
308{
309 if (!arg || !arg->weight_set || arg->weight_set_size == 0)
310 return bucket->item_weights;
311
312 if (position >= arg->weight_set_size)
313 position = arg->weight_set_size - 1;
314 return arg->weight_set[position].weights;
315}
316
317static __s32 *get_choose_arg_ids(const struct crush_bucket_straw2 *bucket,
318 const struct crush_choose_arg *arg)
319{
320 if (!arg || !arg->ids)
321 return bucket->h.items;
322
323 return arg->ids;
324}
325
305static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket, 326static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
306 int x, int r) 327 int x, int r,
328 const struct crush_choose_arg *arg,
329 int position)
307{ 330{
308 unsigned int i, high = 0; 331 unsigned int i, high = 0;
309 unsigned int u; 332 unsigned int u;
310 unsigned int w;
311 __s64 ln, draw, high_draw = 0; 333 __s64 ln, draw, high_draw = 0;
334 __u32 *weights = get_choose_arg_weights(bucket, arg, position);
335 __s32 *ids = get_choose_arg_ids(bucket, arg);
312 336
313 for (i = 0; i < bucket->h.size; i++) { 337 for (i = 0; i < bucket->h.size; i++) {
314 w = bucket->item_weights[i]; 338 dprintk("weight 0x%x item %d\n", weights[i], ids[i]);
315 if (w) { 339 if (weights[i]) {
316 u = crush_hash32_3(bucket->h.hash, x, 340 u = crush_hash32_3(bucket->h.hash, x, ids[i], r);
317 bucket->h.items[i], r);
318 u &= 0xffff; 341 u &= 0xffff;
319 342
320 /* 343 /*
@@ -335,7 +358,7 @@ static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
335 * weight means a larger (less negative) value 358 * weight means a larger (less negative) value
336 * for draw. 359 * for draw.
337 */ 360 */
338 draw = div64_s64(ln, w); 361 draw = div64_s64(ln, weights[i]);
339 } else { 362 } else {
340 draw = S64_MIN; 363 draw = S64_MIN;
341 } 364 }
@@ -352,7 +375,9 @@ static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
352 375
353static int crush_bucket_choose(const struct crush_bucket *in, 376static int crush_bucket_choose(const struct crush_bucket *in,
354 struct crush_work_bucket *work, 377 struct crush_work_bucket *work,
355 int x, int r) 378 int x, int r,
379 const struct crush_choose_arg *arg,
380 int position)
356{ 381{
357 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); 382 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
358 BUG_ON(in->size == 0); 383 BUG_ON(in->size == 0);
@@ -374,7 +399,7 @@ static int crush_bucket_choose(const struct crush_bucket *in,
374 case CRUSH_BUCKET_STRAW2: 399 case CRUSH_BUCKET_STRAW2:
375 return bucket_straw2_choose( 400 return bucket_straw2_choose(
376 (const struct crush_bucket_straw2 *)in, 401 (const struct crush_bucket_straw2 *)in,
377 x, r); 402 x, r, arg, position);
378 default: 403 default:
379 dprintk("unknown bucket %d alg %d\n", in->id, in->alg); 404 dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
380 return in->items[0]; 405 return in->items[0];
@@ -436,7 +461,8 @@ static int crush_choose_firstn(const struct crush_map *map,
436 unsigned int vary_r, 461 unsigned int vary_r,
437 unsigned int stable, 462 unsigned int stable,
438 int *out2, 463 int *out2,
439 int parent_r) 464 int parent_r,
465 const struct crush_choose_arg *choose_args)
440{ 466{
441 int rep; 467 int rep;
442 unsigned int ftotal, flocal; 468 unsigned int ftotal, flocal;
@@ -486,7 +512,10 @@ static int crush_choose_firstn(const struct crush_map *map,
486 else 512 else
487 item = crush_bucket_choose( 513 item = crush_bucket_choose(
488 in, work->work[-1-in->id], 514 in, work->work[-1-in->id],
489 x, r); 515 x, r,
516 (choose_args ?
517 &choose_args[-1-in->id] : 0),
518 outpos);
490 if (item >= map->max_devices) { 519 if (item >= map->max_devices) {
491 dprintk(" bad item %d\n", item); 520 dprintk(" bad item %d\n", item);
492 skip_rep = 1; 521 skip_rep = 1;
@@ -543,7 +572,8 @@ static int crush_choose_firstn(const struct crush_map *map,
543 vary_r, 572 vary_r,
544 stable, 573 stable,
545 NULL, 574 NULL,
546 sub_r) <= outpos) 575 sub_r,
576 choose_args) <= outpos)
547 /* didn't get leaf */ 577 /* didn't get leaf */
548 reject = 1; 578 reject = 1;
549 } else { 579 } else {
@@ -620,7 +650,8 @@ static void crush_choose_indep(const struct crush_map *map,
620 unsigned int recurse_tries, 650 unsigned int recurse_tries,
621 int recurse_to_leaf, 651 int recurse_to_leaf,
622 int *out2, 652 int *out2,
623 int parent_r) 653 int parent_r,
654 const struct crush_choose_arg *choose_args)
624{ 655{
625 const struct crush_bucket *in = bucket; 656 const struct crush_bucket *in = bucket;
626 int endpos = outpos + left; 657 int endpos = outpos + left;
@@ -692,7 +723,10 @@ static void crush_choose_indep(const struct crush_map *map,
692 723
693 item = crush_bucket_choose( 724 item = crush_bucket_choose(
694 in, work->work[-1-in->id], 725 in, work->work[-1-in->id],
695 x, r); 726 x, r,
727 (choose_args ?
728 &choose_args[-1-in->id] : 0),
729 outpos);
696 if (item >= map->max_devices) { 730 if (item >= map->max_devices) {
697 dprintk(" bad item %d\n", item); 731 dprintk(" bad item %d\n", item);
698 out[rep] = CRUSH_ITEM_NONE; 732 out[rep] = CRUSH_ITEM_NONE;
@@ -746,7 +780,8 @@ static void crush_choose_indep(const struct crush_map *map,
746 x, 1, numrep, 0, 780 x, 1, numrep, 0,
747 out2, rep, 781 out2, rep,
748 recurse_tries, 0, 782 recurse_tries, 0,
749 0, NULL, r); 783 0, NULL, r,
784 choose_args);
750 if (out2[rep] == CRUSH_ITEM_NONE) { 785 if (out2[rep] == CRUSH_ITEM_NONE) {
751 /* placed nothing; no leaf */ 786 /* placed nothing; no leaf */
752 break; 787 break;
@@ -854,11 +889,12 @@ void crush_init_workspace(const struct crush_map *map, void *v)
854 * @weight: weight vector (for map leaves) 889 * @weight: weight vector (for map leaves)
855 * @weight_max: size of weight vector 890 * @weight_max: size of weight vector
856 * @cwin: pointer to at least crush_work_size() bytes of memory 891 * @cwin: pointer to at least crush_work_size() bytes of memory
892 * @choose_args: weights and ids for each known bucket
857 */ 893 */
858int crush_do_rule(const struct crush_map *map, 894int crush_do_rule(const struct crush_map *map,
859 int ruleno, int x, int *result, int result_max, 895 int ruleno, int x, int *result, int result_max,
860 const __u32 *weight, int weight_max, 896 const __u32 *weight, int weight_max,
861 void *cwin) 897 void *cwin, const struct crush_choose_arg *choose_args)
862{ 898{
863 int result_len; 899 int result_len;
864 struct crush_work *cw = cwin; 900 struct crush_work *cw = cwin;
@@ -1013,7 +1049,8 @@ int crush_do_rule(const struct crush_map *map,
1013 vary_r, 1049 vary_r,
1014 stable, 1050 stable,
1015 c+osize, 1051 c+osize,
1016 0); 1052 0,
1053 choose_args);
1017 } else { 1054 } else {
1018 out_size = ((numrep < (result_max-osize)) ? 1055 out_size = ((numrep < (result_max-osize)) ?
1019 numrep : (result_max-osize)); 1056 numrep : (result_max-osize));
@@ -1030,7 +1067,8 @@ int crush_do_rule(const struct crush_map *map,
1030 choose_leaf_tries : 1, 1067 choose_leaf_tries : 1,
1031 recurse_to_leaf, 1068 recurse_to_leaf,
1032 c+osize, 1069 c+osize,
1033 0); 1070 0,
1071 choose_args);
1034 osize += out_size; 1072 osize += out_size;
1035 } 1073 }
1036 } 1074 }
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 93baa69407c5..9da0ee61aca5 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -2111,7 +2111,7 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
2111 2111
2112 mutex_lock(&map->crush_workspace_mutex); 2112 mutex_lock(&map->crush_workspace_mutex);
2113 r = crush_do_rule(map->crush, ruleno, x, result, result_max, 2113 r = crush_do_rule(map->crush, ruleno, x, result, result_max,
2114 weight, weight_max, map->crush_workspace); 2114 weight, weight_max, map->crush_workspace, NULL);
2115 mutex_unlock(&map->crush_workspace_mutex); 2115 mutex_unlock(&map->crush_workspace_mutex);
2116 2116
2117 return r; 2117 return r;