aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ceph/crush/mapper.c
diff options
context:
space:
mode:
authorSage Weil <sage@newdream.net>2009-10-06 14:31:11 -0400
committerSage Weil <sage@newdream.net>2009-10-06 14:31:11 -0400
commit5ecc0a0f8128b1876e8614638deaed49cc8b174c (patch)
tree6b1511b0e3ec2a349a24faebc375548b14ee137a /fs/ceph/crush/mapper.c
parentf24e9980eb860d8600cbe5ef3d2fd9295320d229 (diff)
ceph: CRUSH mapping algorithm
CRUSH is a pseudorandom data distribution function designed to map inputs onto a dynamic hierarchy of devices, while minimizing the extent to which inputs are remapped when the devices are added or removed. It includes some features that are specifically useful for storage, most notably the ability to map each input onto a set of N devices that are separated across administrator-defined failure domains. CRUSH is used to distribute data across the cluster of Ceph storage nodes. More information about CRUSH can be found in this paper: http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf Signed-off-by: Sage Weil <sage@newdream.net>
Diffstat (limited to 'fs/ceph/crush/mapper.c')
-rw-r--r--fs/ceph/crush/mapper.c589
1 files changed, 589 insertions, 0 deletions
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
new file mode 100644
index 000000000000..0f0730c62695
--- /dev/null
+++ b/fs/ceph/crush/mapper.c
@@ -0,0 +1,589 @@
1
2#ifdef __KERNEL__
3# include <linux/string.h>
4# include <linux/slab.h>
5# include <linux/bug.h>
6# include <linux/kernel.h>
7# ifndef dprintk
8# define dprintk(args...)
9# endif
10#else
11# include <string.h>
12# include <stdio.h>
13# include <stdlib.h>
14# include <assert.h>
15# define BUG_ON(x) assert(!(x))
16# define dprintk(args...) /* printf(args) */
17# define kmalloc(x, f) malloc(x)
18# define kfree(x) free(x)
19#endif
20
21#include "crush.h"
22#include "hash.h"
23
24/*
25 * Implement the core CRUSH mapping algorithm.
26 */
27
28/**
29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
30 * @map: the crush_map
31 * @ruleset: the storage ruleset id (user defined)
32 * @type: storage ruleset type (user defined)
33 * @size: output set size
34 */
35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
36{
37 int i;
38
39 for (i = 0; i < map->max_rules; i++) {
40 if (map->rules[i] &&
41 map->rules[i]->mask.ruleset == ruleset &&
42 map->rules[i]->mask.type == type &&
43 map->rules[i]->mask.min_size <= size &&
44 map->rules[i]->mask.max_size >= size)
45 return i;
46 }
47 return -1;
48}
49
50
51/*
52 * bucket choose methods
53 *
54 * For each bucket algorithm, we have a "choose" method that, given a
55 * crush input @x and replica position (usually, position in output set) @r,
56 * will produce an item in the bucket.
57 */
58
59/*
60 * Choose based on a random permutation of the bucket.
61 *
62 * We used to use some prime number arithmetic to do this, but it
63 * wasn't very random, and had some other bad behaviors. Instead, we
64 * calculate an actual random permutation of the bucket members.
65 * Since this is expensive, we optimize for the r=0 case, which
66 * captures the vast majority of calls.
67 */
68static int bucket_perm_choose(struct crush_bucket *bucket,
69 int x, int r)
70{
71 unsigned pr = r % bucket->size;
72 unsigned i, s;
73
74 /* start a new permutation if @x has changed */
75 if (bucket->perm_x != x || bucket->perm_n == 0) {
76 dprintk("bucket %d new x=%d\n", bucket->id, x);
77 bucket->perm_x = x;
78
79 /* optimize common r=0 case */
80 if (pr == 0) {
81 s = crush_hash32_3(x, bucket->id, 0) %
82 bucket->size;
83 bucket->perm[0] = s;
84 bucket->perm_n = 0xffff; /* magic value, see below */
85 goto out;
86 }
87
88 for (i = 0; i < bucket->size; i++)
89 bucket->perm[i] = i;
90 bucket->perm_n = 0;
91 } else if (bucket->perm_n == 0xffff) {
92 /* clean up after the r=0 case above */
93 for (i = 1; i < bucket->size; i++)
94 bucket->perm[i] = i;
95 bucket->perm[bucket->perm[0]] = 0;
96 bucket->perm_n = 1;
97 }
98
99 /* calculate permutation up to pr */
100 for (i = 0; i < bucket->perm_n; i++)
101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
102 while (bucket->perm_n <= pr) {
103 unsigned p = bucket->perm_n;
104 /* no point in swapping the final entry */
105 if (p < bucket->size - 1) {
106 i = crush_hash32_3(x, bucket->id, p) %
107 (bucket->size - p);
108 if (i) {
109 unsigned t = bucket->perm[p + i];
110 bucket->perm[p + i] = bucket->perm[p];
111 bucket->perm[p] = t;
112 }
113 dprintk(" perm_choose swap %d with %d\n", p, p+i);
114 }
115 bucket->perm_n++;
116 }
117 for (i = 0; i < bucket->size; i++)
118 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
119
120 s = bucket->perm[pr];
121out:
122 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
123 bucket->size, x, r, pr, s);
124 return bucket->items[s];
125}
126
127/* uniform */
128static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
129 int x, int r)
130{
131 return bucket_perm_choose(&bucket->h, x, r);
132}
133
134/* list */
135static int bucket_list_choose(struct crush_bucket_list *bucket,
136 int x, int r)
137{
138 int i;
139
140 for (i = bucket->h.size-1; i >= 0; i--) {
141 __u64 w = crush_hash32_4(x, bucket->h.items[i], r,
142 bucket->h.id);
143 w &= 0xffff;
144 dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
145 "sw %x rand %llx",
146 i, x, r, bucket->h.items[i], bucket->item_weights[i],
147 bucket->sum_weights[i], w);
148 w *= bucket->sum_weights[i];
149 w = w >> 16;
150 /*dprintk(" scaled %llx\n", w);*/
151 if (w < bucket->item_weights[i])
152 return bucket->h.items[i];
153 }
154
155 BUG_ON(1);
156 return 0;
157}
158
159
160/* (binary) tree */
161static int height(int n)
162{
163 int h = 0;
164 while ((n & 1) == 0) {
165 h++;
166 n = n >> 1;
167 }
168 return h;
169}
170
171static int left(int x)
172{
173 int h = height(x);
174 return x - (1 << (h-1));
175}
176
177static int right(int x)
178{
179 int h = height(x);
180 return x + (1 << (h-1));
181}
182
183static int terminal(int x)
184{
185 return x & 1;
186}
187
188static int bucket_tree_choose(struct crush_bucket_tree *bucket,
189 int x, int r)
190{
191 int n, l;
192 __u32 w;
193 __u64 t;
194
195 /* start at root */
196 n = bucket->num_nodes >> 1;
197
198 while (!terminal(n)) {
199 /* pick point in [0, w) */
200 w = bucket->node_weights[n];
201 t = (__u64)crush_hash32_4(x, n, r, bucket->h.id) * (__u64)w;
202 t = t >> 32;
203
204 /* descend to the left or right? */
205 l = left(n);
206 if (t < bucket->node_weights[l])
207 n = l;
208 else
209 n = right(n);
210 }
211
212 return bucket->h.items[n >> 1];
213}
214
215
216/* straw */
217
218static int bucket_straw_choose(struct crush_bucket_straw *bucket,
219 int x, int r)
220{
221 int i;
222 int high = 0;
223 __u64 high_draw = 0;
224 __u64 draw;
225
226 for (i = 0; i < bucket->h.size; i++) {
227 draw = crush_hash32_3(x, bucket->h.items[i], r);
228 draw &= 0xffff;
229 draw *= bucket->straws[i];
230 if (i == 0 || draw > high_draw) {
231 high = i;
232 high_draw = draw;
233 }
234 }
235 return bucket->h.items[high];
236}
237
238static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
239{
240 dprintk("choose %d x=%d r=%d\n", in->id, x, r);
241 switch (in->alg) {
242 case CRUSH_BUCKET_UNIFORM:
243 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
244 x, r);
245 case CRUSH_BUCKET_LIST:
246 return bucket_list_choose((struct crush_bucket_list *)in,
247 x, r);
248 case CRUSH_BUCKET_TREE:
249 return bucket_tree_choose((struct crush_bucket_tree *)in,
250 x, r);
251 case CRUSH_BUCKET_STRAW:
252 return bucket_straw_choose((struct crush_bucket_straw *)in,
253 x, r);
254 default:
255 BUG_ON(1);
256/* return in->items[0] */;
257 }
258}
259
260/*
261 * true if device is marked "out" (failed, fully offloaded)
262 * of the cluster
263 */
264static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
265{
266 if (weight[item] >= 0x1000)
267 return 0;
268 if (weight[item] == 0)
269 return 1;
270 if ((crush_hash32_2(x, item) & 0xffff) < weight[item])
271 return 0;
272 return 1;
273}
274
275/**
276 * crush_choose - choose numrep distinct items of given type
277 * @map: the crush_map
278 * @bucket: the bucket we are choose an item from
279 * @x: crush input value
280 * @numrep: the number of items to choose
281 * @type: the type of item to choose
282 * @out: pointer to output vector
283 * @outpos: our position in that vector
284 * @firstn: true if choosing "first n" items, false if choosing "indep"
285 * @recurse_to_leaf: true if we want one device under each item of given type
286 * @out2: second output vector for leaf items (if @recurse_to_leaf)
287 */
288static int crush_choose(struct crush_map *map,
289 struct crush_bucket *bucket,
290 __u32 *weight,
291 int x, int numrep, int type,
292 int *out, int outpos,
293 int firstn, int recurse_to_leaf,
294 int *out2)
295{
296 int rep;
297 int ftotal, flocal;
298 int retry_descent, retry_bucket, skip_rep;
299 struct crush_bucket *in = bucket;
300 int r;
301 int i;
302 int item;
303 int itemtype;
304 int collide, reject;
305 const int orig_tries = 5; /* attempts before we fall back to search */
306 dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos);
307
308 for (rep = outpos; rep < numrep; rep++) {
309 /* keep trying until we get a non-out, non-colliding item */
310 ftotal = 0;
311 skip_rep = 0;
312 do {
313 retry_descent = 0;
314 in = bucket; /* initial bucket */
315
316 /* choose through intervening buckets */
317 flocal = 0;
318 do {
319 retry_bucket = 0;
320 r = rep;
321 if (in->alg == CRUSH_BUCKET_UNIFORM) {
322 /* be careful */
323 if (firstn || numrep >= in->size)
324 /* r' = r + f_total */
325 r += ftotal;
326 else if (in->size % numrep == 0)
327 /* r'=r+(n+1)*f_local */
328 r += (numrep+1) *
329 (flocal+ftotal);
330 else
331 /* r' = r + n*f_local */
332 r += numrep * (flocal+ftotal);
333 } else {
334 if (firstn)
335 /* r' = r + f_total */
336 r += ftotal;
337 else
338 /* r' = r + n*f_local */
339 r += numrep * (flocal+ftotal);
340 }
341
342 /* bucket choose */
343 if (flocal >= (in->size>>1) &&
344 flocal > orig_tries)
345 item = bucket_perm_choose(in, x, r);
346 else
347 item = crush_bucket_choose(in, x, r);
348 BUG_ON(item >= map->max_devices);
349
350 /* desired type? */
351 if (item < 0)
352 itemtype = map->buckets[-1-item]->type;
353 else
354 itemtype = 0;
355 dprintk(" item %d type %d\n", item, itemtype);
356
357 /* keep going? */
358 if (itemtype != type) {
359 BUG_ON(item >= 0 ||
360 (-1-item) >= map->max_buckets);
361 in = map->buckets[-1-item];
362 continue;
363 }
364
365 /* collision? */
366 collide = 0;
367 for (i = 0; i < outpos; i++) {
368 if (out[i] == item) {
369 collide = 1;
370 break;
371 }
372 }
373
374 if (recurse_to_leaf &&
375 item < 0 &&
376 crush_choose(map, map->buckets[-1-item],
377 weight,
378 x, outpos+1, 0,
379 out2, outpos,
380 firstn, 0, NULL) <= outpos) {
381 reject = 1;
382 } else {
383 /* out? */
384 if (itemtype == 0)
385 reject = is_out(map, weight,
386 item, x);
387 else
388 reject = 0;
389 }
390
391 if (reject || collide) {
392 ftotal++;
393 flocal++;
394
395 if (collide && flocal < 3)
396 /* retry locally a few times */
397 retry_bucket = 1;
398 else if (flocal < in->size + orig_tries)
399 /* exhaustive bucket search */
400 retry_bucket = 1;
401 else if (ftotal < 20)
402 /* then retry descent */
403 retry_descent = 1;
404 else
405 /* else give up */
406 skip_rep = 1;
407 dprintk(" reject %d collide %d "
408 "ftotal %d flocal %d\n",
409 reject, collide, ftotal,
410 flocal);
411 }
412 } while (retry_bucket);
413 } while (retry_descent);
414
415 if (skip_rep) {
416 dprintk("skip rep\n");
417 continue;
418 }
419
420 dprintk("choose got %d\n", item);
421 out[outpos] = item;
422 outpos++;
423 }
424
425 dprintk("choose returns %d\n", outpos);
426 return outpos;
427}
428
429
430/**
431 * crush_do_rule - calculate a mapping with the given input and rule
432 * @map: the crush_map
433 * @ruleno: the rule id
434 * @x: hash input
435 * @result: pointer to result vector
436 * @result_max: maximum result size
437 * @force: force initial replica choice; -1 for none
438 */
439int crush_do_rule(struct crush_map *map,
440 int ruleno, int x, int *result, int result_max,
441 int force, __u32 *weight)
442{
443 int result_len;
444 int force_context[CRUSH_MAX_DEPTH];
445 int force_pos = -1;
446 int a[CRUSH_MAX_SET];
447 int b[CRUSH_MAX_SET];
448 int c[CRUSH_MAX_SET];
449 int recurse_to_leaf;
450 int *w;
451 int wsize = 0;
452 int *o;
453 int osize;
454 int *tmp;
455 struct crush_rule *rule;
456 int step;
457 int i, j;
458 int numrep;
459 int firstn;
460 int rc = -1;
461
462 BUG_ON(ruleno >= map->max_rules);
463
464 rule = map->rules[ruleno];
465 result_len = 0;
466 w = a;
467 o = b;
468
469 /*
470 * determine hierarchical context of force, if any. note
471 * that this may or may not correspond to the specific types
472 * referenced by the crush rule.
473 */
474 if (force >= 0) {
475 if (force >= map->max_devices ||
476 map->device_parents[force] == 0) {
477 /*dprintk("CRUSH: forcefed device dne\n");*/
478 rc = -1; /* force fed device dne */
479 goto out;
480 }
481 if (!is_out(map, weight, force, x)) {
482 while (1) {
483 force_context[++force_pos] = force;
484 if (force >= 0)
485 force = map->device_parents[force];
486 else
487 force = map->bucket_parents[-1-force];
488 if (force == 0)
489 break;
490 }
491 }
492 }
493
494 for (step = 0; step < rule->len; step++) {
495 firstn = 0;
496 switch (rule->steps[step].op) {
497 case CRUSH_RULE_TAKE:
498 w[0] = rule->steps[step].arg1;
499 if (force_pos >= 0) {
500 BUG_ON(force_context[force_pos] != w[0]);
501 force_pos--;
502 }
503 wsize = 1;
504 break;
505
506 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
507 case CRUSH_RULE_CHOOSE_FIRSTN:
508 firstn = 1;
509 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
510 case CRUSH_RULE_CHOOSE_INDEP:
511 BUG_ON(wsize == 0);
512
513 recurse_to_leaf =
514 rule->steps[step].op ==
515 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
516 rule->steps[step].op ==
517 CRUSH_RULE_CHOOSE_LEAF_INDEP;
518
519 /* reset output */
520 osize = 0;
521
522 for (i = 0; i < wsize; i++) {
523 /*
524 * see CRUSH_N, CRUSH_N_MINUS macros.
525 * basically, numrep <= 0 means relative to
526 * the provided result_max
527 */
528 numrep = rule->steps[step].arg1;
529 if (numrep <= 0) {
530 numrep += result_max;
531 if (numrep <= 0)
532 continue;
533 }
534 j = 0;
535 if (osize == 0 && force_pos >= 0) {
536 /* skip any intermediate types */
537 while (force_pos &&
538 force_context[force_pos] < 0 &&
539 rule->steps[step].arg2 !=
540 map->buckets[-1 -
541 force_context[force_pos]]->type)
542 force_pos--;
543 o[osize] = force_context[force_pos];
544 if (recurse_to_leaf)
545 c[osize] = force_context[0];
546 j++;
547 force_pos--;
548 }
549 osize += crush_choose(map,
550 map->buckets[-1-w[i]],
551 weight,
552 x, numrep,
553 rule->steps[step].arg2,
554 o+osize, j,
555 firstn,
556 recurse_to_leaf, c+osize);
557 }
558
559 if (recurse_to_leaf)
560 /* copy final _leaf_ values to output set */
561 memcpy(o, c, osize*sizeof(*o));
562
563 /* swap t and w arrays */
564 tmp = o;
565 o = w;
566 w = tmp;
567 wsize = osize;
568 break;
569
570
571 case CRUSH_RULE_EMIT:
572 for (i = 0; i < wsize && result_len < result_max; i++) {
573 result[result_len] = w[i];
574 result_len++;
575 }
576 wsize = 0;
577 break;
578
579 default:
580 BUG_ON(1);
581 }
582 }
583 rc = result_len;
584
585out:
586 return rc;
587}
588
589