aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSage Weil <sage@newdream.net>2009-10-06 14:31:11 -0400
committerSage Weil <sage@newdream.net>2009-10-06 14:31:11 -0400
commit5ecc0a0f8128b1876e8614638deaed49cc8b174c (patch)
tree6b1511b0e3ec2a349a24faebc375548b14ee137a
parentf24e9980eb860d8600cbe5ef3d2fd9295320d229 (diff)
ceph: CRUSH mapping algorithm
CRUSH is a pseudorandom data distribution function designed to map inputs onto a dynamic hierarchy of devices, while minimizing the extent to which inputs are remapped when the devices are added or removed. It includes some features that are specifically useful for storage, most notably the ability to map each input onto a set of N devices that are separated across administrator-defined failure domains. CRUSH is used to distribute data across the cluster of Ceph storage nodes. More information about CRUSH can be found in this paper: http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf Signed-off-by: Sage Weil <sage@newdream.net>
-rw-r--r--fs/ceph/crush/crush.c140
-rw-r--r--fs/ceph/crush/crush.h188
-rw-r--r--fs/ceph/crush/hash.h90
-rw-r--r--fs/ceph/crush/mapper.c589
-rw-r--r--fs/ceph/crush/mapper.h20
5 files changed, 1027 insertions, 0 deletions
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
new file mode 100644
index 000000000000..13755cdc4fb3
--- /dev/null
+++ b/fs/ceph/crush/crush.c
@@ -0,0 +1,140 @@
1
2#ifdef __KERNEL__
3# include <linux/slab.h>
4#else
5# include <stdlib.h>
6# include <assert.h>
7# define kfree(x) do { if (x) free(x); } while (0)
8# define BUG_ON(x) assert(!(x))
9#endif
10
11#include "crush.h"
12
13/**
14 * crush_get_bucket_item_weight - Get weight of an item in given bucket
15 * @b: bucket pointer
16 * @p: item index in bucket
17 */
18int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
19{
20 if (p >= b->size)
21 return 0;
22
23 switch (b->alg) {
24 case CRUSH_BUCKET_UNIFORM:
25 return ((struct crush_bucket_uniform *)b)->item_weight;
26 case CRUSH_BUCKET_LIST:
27 return ((struct crush_bucket_list *)b)->item_weights[p];
28 case CRUSH_BUCKET_TREE:
29 if (p & 1)
30 return ((struct crush_bucket_tree *)b)->node_weights[p];
31 return 0;
32 case CRUSH_BUCKET_STRAW:
33 return ((struct crush_bucket_straw *)b)->item_weights[p];
34 }
35 return 0;
36}
37
38/**
39 * crush_calc_parents - Calculate parent vectors for the given crush map.
40 * @map: crush_map pointer
41 */
42void crush_calc_parents(struct crush_map *map)
43{
44 int i, b, c;
45
46 for (b = 0; b < map->max_buckets; b++) {
47 if (map->buckets[b] == NULL)
48 continue;
49 for (i = 0; i < map->buckets[b]->size; i++) {
50 c = map->buckets[b]->items[i];
51 BUG_ON(c >= map->max_devices ||
52 c < -map->max_buckets);
53 if (c >= 0)
54 map->device_parents[c] = map->buckets[b]->id;
55 else
56 map->bucket_parents[-1-c] = map->buckets[b]->id;
57 }
58 }
59}
60
61void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
62{
63 kfree(b->h.perm);
64 kfree(b->h.items);
65 kfree(b);
66}
67
68void crush_destroy_bucket_list(struct crush_bucket_list *b)
69{
70 kfree(b->item_weights);
71 kfree(b->sum_weights);
72 kfree(b->h.perm);
73 kfree(b->h.items);
74 kfree(b);
75}
76
77void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
78{
79 kfree(b->node_weights);
80 kfree(b);
81}
82
83void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
84{
85 kfree(b->straws);
86 kfree(b->item_weights);
87 kfree(b->h.perm);
88 kfree(b->h.items);
89 kfree(b);
90}
91
92void crush_destroy_bucket(struct crush_bucket *b)
93{
94 switch (b->alg) {
95 case CRUSH_BUCKET_UNIFORM:
96 crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
97 break;
98 case CRUSH_BUCKET_LIST:
99 crush_destroy_bucket_list((struct crush_bucket_list *)b);
100 break;
101 case CRUSH_BUCKET_TREE:
102 crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
103 break;
104 case CRUSH_BUCKET_STRAW:
105 crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
106 break;
107 }
108}
109
110/**
111 * crush_destroy - Destroy a crush_map
112 * @map: crush_map pointer
113 */
114void crush_destroy(struct crush_map *map)
115{
116 int b;
117
118 /* buckets */
119 if (map->buckets) {
120 for (b = 0; b < map->max_buckets; b++) {
121 if (map->buckets[b] == NULL)
122 continue;
123 crush_destroy_bucket(map->buckets[b]);
124 }
125 kfree(map->buckets);
126 }
127
128 /* rules */
129 if (map->rules) {
130 for (b = 0; b < map->max_rules; b++)
131 kfree(map->rules[b]);
132 kfree(map->rules);
133 }
134
135 kfree(map->bucket_parents);
136 kfree(map->device_parents);
137 kfree(map);
138}
139
140
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
new file mode 100644
index 000000000000..9ac7e091126f
--- /dev/null
+++ b/fs/ceph/crush/crush.h
@@ -0,0 +1,188 @@
1#ifndef _CRUSH_CRUSH_H
2#define _CRUSH_CRUSH_H
3
4#include <linux/types.h>
5
6/*
7 * CRUSH is a pseudo-random data distribution algorithm that
8 * efficiently distributes input values (typically, data objects)
9 * across a heterogeneous, structured storage cluster.
10 *
11 * The algorithm was originally described in detail in this paper
12 * (although the algorithm has evolved somewhat since then):
13 *
14 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
15 *
16 * LGPL2
17 */
18
19
20#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
21
22
23#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
24#define CRUSH_MAX_SET 10 /* max size of a mapping result */
25
26
27/*
28 * CRUSH uses user-defined "rules" to describe how inputs should be
29 * mapped to devices. A rule consists of sequence of steps to perform
30 * to generate the set of output devices.
31 */
32struct crush_rule_step {
33 __u32 op;
34 __s32 arg1;
35 __s32 arg2;
36};
37
38/* step op codes */
39enum {
40 CRUSH_RULE_NOOP = 0,
41 CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
42 CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
43 /* arg2 = type */
44 CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
45 CRUSH_RULE_EMIT = 4, /* no args */
46 CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
47 CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
48};
49
50/*
51 * for specifying choose num (arg1) relative to the max parameter
52 * passed to do_rule
53 */
54#define CRUSH_CHOOSE_N 0
55#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
56
57/*
58 * The rule mask is used to describe what the rule is intended for.
59 * Given a ruleset and size of output set, we search through the
60 * rule list for a matching rule_mask.
61 */
62struct crush_rule_mask {
63 __u8 ruleset;
64 __u8 type;
65 __u8 min_size;
66 __u8 max_size;
67};
68
69struct crush_rule {
70 __u32 len;
71 struct crush_rule_mask mask;
72 struct crush_rule_step steps[0];
73};
74
75#define crush_rule_size(len) (sizeof(struct crush_rule) + \
76 (len)*sizeof(struct crush_rule_step))
77
78
79
80/*
81 * A bucket is a named container of other items (either devices or
82 * other buckets). Items within a bucket are chosen using one of a
83 * few different algorithms. The table summarizes how the speed of
84 * each option measures up against mapping stability when items are
85 * added or removed.
86 *
87 * Bucket Alg Speed Additions Removals
88 * ------------------------------------------------
89 * uniform O(1) poor poor
90 * list O(n) optimal poor
91 * tree O(log n) good good
92 * straw O(n) optimal optimal
93 */
94enum {
95 CRUSH_BUCKET_UNIFORM = 1,
96 CRUSH_BUCKET_LIST = 2,
97 CRUSH_BUCKET_TREE = 3,
98 CRUSH_BUCKET_STRAW = 4
99};
100static inline const char *crush_bucket_alg_name(int alg)
101{
102 switch (alg) {
103 case CRUSH_BUCKET_UNIFORM: return "uniform";
104 case CRUSH_BUCKET_LIST: return "list";
105 case CRUSH_BUCKET_TREE: return "tree";
106 case CRUSH_BUCKET_STRAW: return "straw";
107 default: return "unknown";
108 }
109}
110
111struct crush_bucket {
112 __s32 id; /* this'll be negative */
113 __u16 type; /* non-zero; type=0 is reserved for devices */
114 __u16 alg; /* one of CRUSH_BUCKET_* */
115 __u32 weight; /* 16-bit fixed point */
116 __u32 size; /* num items */
117 __s32 *items;
118
119 /*
120 * cached random permutation: used for uniform bucket and for
121 * the linear search fallback for the other bucket types.
122 */
123 __u32 perm_x; /* @x for which *perm is defined */
124 __u32 perm_n; /* num elements of *perm that are permuted/defined */
125 __u32 *perm;
126};
127
128struct crush_bucket_uniform {
129 struct crush_bucket h;
130 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
131};
132
133struct crush_bucket_list {
134 struct crush_bucket h;
135 __u32 *item_weights; /* 16-bit fixed point */
136 __u32 *sum_weights; /* 16-bit fixed point. element i is sum
137 of weights 0..i, inclusive */
138};
139
140struct crush_bucket_tree {
141 struct crush_bucket h; /* note: h.size is _tree_ size, not number of
142 actual items */
143 __u8 num_nodes;
144 __u32 *node_weights;
145};
146
147struct crush_bucket_straw {
148 struct crush_bucket h;
149 __u32 *item_weights; /* 16-bit fixed point */
150 __u32 *straws; /* 16-bit fixed point */
151};
152
153
154
155/*
156 * CRUSH map includes all buckets, rules, etc.
157 */
158struct crush_map {
159 struct crush_bucket **buckets;
160 struct crush_rule **rules;
161
162 /*
163 * Parent pointers to identify the parent bucket a device or
164 * bucket in the hierarchy. If an item appears more than
165 * once, this is the _last_ time it appeared (where buckets
166 * are processed in bucket id order, from -1 on down to
167 * -max_buckets.
168 */
169 __u32 *bucket_parents;
170 __u32 *device_parents;
171
172 __s32 max_buckets;
173 __u32 max_rules;
174 __s32 max_devices;
175};
176
177
178/* crush.c */
179extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
180extern void crush_calc_parents(struct crush_map *map);
181extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
182extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
183extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
184extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
185extern void crush_destroy_bucket(struct crush_bucket *b);
186extern void crush_destroy(struct crush_map *map);
187
188#endif
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
new file mode 100644
index 000000000000..42f3312783a1
--- /dev/null
+++ b/fs/ceph/crush/hash.h
@@ -0,0 +1,90 @@
1#ifndef _CRUSH_HASH_H
2#define _CRUSH_HASH_H
3
4/*
5 * Robert Jenkins' function for mixing 32-bit values
6 * http://burtleburtle.net/bob/hash/evahash.html
7 * a, b = random bits, c = input and output
8 */
9#define crush_hashmix(a, b, c) do { \
10 a = a-b; a = a-c; a = a^(c>>13); \
11 b = b-c; b = b-a; b = b^(a<<8); \
12 c = c-a; c = c-b; c = c^(b>>13); \
13 a = a-b; a = a-c; a = a^(c>>12); \
14 b = b-c; b = b-a; b = b^(a<<16); \
15 c = c-a; c = c-b; c = c^(b>>5); \
16 a = a-b; a = a-c; a = a^(c>>3); \
17 b = b-c; b = b-a; b = b^(a<<10); \
18 c = c-a; c = c-b; c = c^(b>>15); \
19 } while (0)
20
21#define crush_hash_seed 1315423911
22
23static inline __u32 crush_hash32(__u32 a)
24{
25 __u32 hash = crush_hash_seed ^ a;
26 __u32 b = a;
27 __u32 x = 231232;
28 __u32 y = 1232;
29 crush_hashmix(b, x, hash);
30 crush_hashmix(y, a, hash);
31 return hash;
32}
33
34static inline __u32 crush_hash32_2(__u32 a, __u32 b)
35{
36 __u32 hash = crush_hash_seed ^ a ^ b;
37 __u32 x = 231232;
38 __u32 y = 1232;
39 crush_hashmix(a, b, hash);
40 crush_hashmix(x, a, hash);
41 crush_hashmix(b, y, hash);
42 return hash;
43}
44
45static inline __u32 crush_hash32_3(__u32 a, __u32 b, __u32 c)
46{
47 __u32 hash = crush_hash_seed ^ a ^ b ^ c;
48 __u32 x = 231232;
49 __u32 y = 1232;
50 crush_hashmix(a, b, hash);
51 crush_hashmix(c, x, hash);
52 crush_hashmix(y, a, hash);
53 crush_hashmix(b, x, hash);
54 crush_hashmix(y, c, hash);
55 return hash;
56}
57
58static inline __u32 crush_hash32_4(__u32 a, __u32 b, __u32 c,
59 __u32 d)
60{
61 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
62 __u32 x = 231232;
63 __u32 y = 1232;
64 crush_hashmix(a, b, hash);
65 crush_hashmix(c, d, hash);
66 crush_hashmix(a, x, hash);
67 crush_hashmix(y, b, hash);
68 crush_hashmix(c, x, hash);
69 crush_hashmix(y, d, hash);
70 return hash;
71}
72
73static inline __u32 crush_hash32_5(__u32 a, __u32 b, __u32 c,
74 __u32 d, __u32 e)
75{
76 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
77 __u32 x = 231232;
78 __u32 y = 1232;
79 crush_hashmix(a, b, hash);
80 crush_hashmix(c, d, hash);
81 crush_hashmix(e, x, hash);
82 crush_hashmix(y, a, hash);
83 crush_hashmix(b, x, hash);
84 crush_hashmix(y, c, hash);
85 crush_hashmix(d, x, hash);
86 crush_hashmix(y, e, hash);
87 return hash;
88}
89
90#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
new file mode 100644
index 000000000000..0f0730c62695
--- /dev/null
+++ b/fs/ceph/crush/mapper.c
@@ -0,0 +1,589 @@
1
2#ifdef __KERNEL__
3# include <linux/string.h>
4# include <linux/slab.h>
5# include <linux/bug.h>
6# include <linux/kernel.h>
7# ifndef dprintk
8# define dprintk(args...)
9# endif
10#else
11# include <string.h>
12# include <stdio.h>
13# include <stdlib.h>
14# include <assert.h>
15# define BUG_ON(x) assert(!(x))
16# define dprintk(args...) /* printf(args) */
17# define kmalloc(x, f) malloc(x)
18# define kfree(x) free(x)
19#endif
20
21#include "crush.h"
22#include "hash.h"
23
24/*
25 * Implement the core CRUSH mapping algorithm.
26 */
27
28/**
29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
30 * @map: the crush_map
31 * @ruleset: the storage ruleset id (user defined)
32 * @type: storage ruleset type (user defined)
33 * @size: output set size
34 */
35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
36{
37 int i;
38
39 for (i = 0; i < map->max_rules; i++) {
40 if (map->rules[i] &&
41 map->rules[i]->mask.ruleset == ruleset &&
42 map->rules[i]->mask.type == type &&
43 map->rules[i]->mask.min_size <= size &&
44 map->rules[i]->mask.max_size >= size)
45 return i;
46 }
47 return -1;
48}
49
50
51/*
52 * bucket choose methods
53 *
54 * For each bucket algorithm, we have a "choose" method that, given a
55 * crush input @x and replica position (usually, position in output set) @r,
56 * will produce an item in the bucket.
57 */
58
59/*
60 * Choose based on a random permutation of the bucket.
61 *
62 * We used to use some prime number arithmetic to do this, but it
63 * wasn't very random, and had some other bad behaviors. Instead, we
64 * calculate an actual random permutation of the bucket members.
65 * Since this is expensive, we optimize for the r=0 case, which
66 * captures the vast majority of calls.
67 */
68static int bucket_perm_choose(struct crush_bucket *bucket,
69 int x, int r)
70{
71 unsigned pr = r % bucket->size;
72 unsigned i, s;
73
74 /* start a new permutation if @x has changed */
75 if (bucket->perm_x != x || bucket->perm_n == 0) {
76 dprintk("bucket %d new x=%d\n", bucket->id, x);
77 bucket->perm_x = x;
78
79 /* optimize common r=0 case */
80 if (pr == 0) {
81 s = crush_hash32_3(x, bucket->id, 0) %
82 bucket->size;
83 bucket->perm[0] = s;
84 bucket->perm_n = 0xffff; /* magic value, see below */
85 goto out;
86 }
87
88 for (i = 0; i < bucket->size; i++)
89 bucket->perm[i] = i;
90 bucket->perm_n = 0;
91 } else if (bucket->perm_n == 0xffff) {
92 /* clean up after the r=0 case above */
93 for (i = 1; i < bucket->size; i++)
94 bucket->perm[i] = i;
95 bucket->perm[bucket->perm[0]] = 0;
96 bucket->perm_n = 1;
97 }
98
99 /* calculate permutation up to pr */
100 for (i = 0; i < bucket->perm_n; i++)
101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
102 while (bucket->perm_n <= pr) {
103 unsigned p = bucket->perm_n;
104 /* no point in swapping the final entry */
105 if (p < bucket->size - 1) {
106 i = crush_hash32_3(x, bucket->id, p) %
107 (bucket->size - p);
108 if (i) {
109 unsigned t = bucket->perm[p + i];
110 bucket->perm[p + i] = bucket->perm[p];
111 bucket->perm[p] = t;
112 }
113 dprintk(" perm_choose swap %d with %d\n", p, p+i);
114 }
115 bucket->perm_n++;
116 }
117 for (i = 0; i < bucket->size; i++)
118 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
119
120 s = bucket->perm[pr];
121out:
122 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
123 bucket->size, x, r, pr, s);
124 return bucket->items[s];
125}
126
127/* uniform */
128static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
129 int x, int r)
130{
131 return bucket_perm_choose(&bucket->h, x, r);
132}
133
134/* list */
135static int bucket_list_choose(struct crush_bucket_list *bucket,
136 int x, int r)
137{
138 int i;
139
140 for (i = bucket->h.size-1; i >= 0; i--) {
141 __u64 w = crush_hash32_4(x, bucket->h.items[i], r,
142 bucket->h.id);
143 w &= 0xffff;
144 dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
145 "sw %x rand %llx",
146 i, x, r, bucket->h.items[i], bucket->item_weights[i],
147 bucket->sum_weights[i], w);
148 w *= bucket->sum_weights[i];
149 w = w >> 16;
150 /*dprintk(" scaled %llx\n", w);*/
151 if (w < bucket->item_weights[i])
152 return bucket->h.items[i];
153 }
154
155 BUG_ON(1);
156 return 0;
157}
158
159
160/* (binary) tree */
161static int height(int n)
162{
163 int h = 0;
164 while ((n & 1) == 0) {
165 h++;
166 n = n >> 1;
167 }
168 return h;
169}
170
171static int left(int x)
172{
173 int h = height(x);
174 return x - (1 << (h-1));
175}
176
177static int right(int x)
178{
179 int h = height(x);
180 return x + (1 << (h-1));
181}
182
183static int terminal(int x)
184{
185 return x & 1;
186}
187
188static int bucket_tree_choose(struct crush_bucket_tree *bucket,
189 int x, int r)
190{
191 int n, l;
192 __u32 w;
193 __u64 t;
194
195 /* start at root */
196 n = bucket->num_nodes >> 1;
197
198 while (!terminal(n)) {
199 /* pick point in [0, w) */
200 w = bucket->node_weights[n];
201 t = (__u64)crush_hash32_4(x, n, r, bucket->h.id) * (__u64)w;
202 t = t >> 32;
203
204 /* descend to the left or right? */
205 l = left(n);
206 if (t < bucket->node_weights[l])
207 n = l;
208 else
209 n = right(n);
210 }
211
212 return bucket->h.items[n >> 1];
213}
214
215
216/* straw */
217
218static int bucket_straw_choose(struct crush_bucket_straw *bucket,
219 int x, int r)
220{
221 int i;
222 int high = 0;
223 __u64 high_draw = 0;
224 __u64 draw;
225
226 for (i = 0; i < bucket->h.size; i++) {
227 draw = crush_hash32_3(x, bucket->h.items[i], r);
228 draw &= 0xffff;
229 draw *= bucket->straws[i];
230 if (i == 0 || draw > high_draw) {
231 high = i;
232 high_draw = draw;
233 }
234 }
235 return bucket->h.items[high];
236}
237
238static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
239{
240 dprintk("choose %d x=%d r=%d\n", in->id, x, r);
241 switch (in->alg) {
242 case CRUSH_BUCKET_UNIFORM:
243 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
244 x, r);
245 case CRUSH_BUCKET_LIST:
246 return bucket_list_choose((struct crush_bucket_list *)in,
247 x, r);
248 case CRUSH_BUCKET_TREE:
249 return bucket_tree_choose((struct crush_bucket_tree *)in,
250 x, r);
251 case CRUSH_BUCKET_STRAW:
252 return bucket_straw_choose((struct crush_bucket_straw *)in,
253 x, r);
254 default:
255 BUG_ON(1);
256/* return in->items[0] */;
257 }
258}
259
260/*
261 * true if device is marked "out" (failed, fully offloaded)
262 * of the cluster
263 */
264static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
265{
266 if (weight[item] >= 0x1000)
267 return 0;
268 if (weight[item] == 0)
269 return 1;
270 if ((crush_hash32_2(x, item) & 0xffff) < weight[item])
271 return 0;
272 return 1;
273}
274
275/**
276 * crush_choose - choose numrep distinct items of given type
277 * @map: the crush_map
278 * @bucket: the bucket we are choose an item from
279 * @x: crush input value
280 * @numrep: the number of items to choose
281 * @type: the type of item to choose
282 * @out: pointer to output vector
283 * @outpos: our position in that vector
284 * @firstn: true if choosing "first n" items, false if choosing "indep"
285 * @recurse_to_leaf: true if we want one device under each item of given type
286 * @out2: second output vector for leaf items (if @recurse_to_leaf)
287 */
288static int crush_choose(struct crush_map *map,
289 struct crush_bucket *bucket,
290 __u32 *weight,
291 int x, int numrep, int type,
292 int *out, int outpos,
293 int firstn, int recurse_to_leaf,
294 int *out2)
295{
296 int rep;
297 int ftotal, flocal;
298 int retry_descent, retry_bucket, skip_rep;
299 struct crush_bucket *in = bucket;
300 int r;
301 int i;
302 int item;
303 int itemtype;
304 int collide, reject;
305 const int orig_tries = 5; /* attempts before we fall back to search */
306 dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos);
307
308 for (rep = outpos; rep < numrep; rep++) {
309 /* keep trying until we get a non-out, non-colliding item */
310 ftotal = 0;
311 skip_rep = 0;
312 do {
313 retry_descent = 0;
314 in = bucket; /* initial bucket */
315
316 /* choose through intervening buckets */
317 flocal = 0;
318 do {
319 retry_bucket = 0;
320 r = rep;
321 if (in->alg == CRUSH_BUCKET_UNIFORM) {
322 /* be careful */
323 if (firstn || numrep >= in->size)
324 /* r' = r + f_total */
325 r += ftotal;
326 else if (in->size % numrep == 0)
327 /* r'=r+(n+1)*f_local */
328 r += (numrep+1) *
329 (flocal+ftotal);
330 else
331 /* r' = r + n*f_local */
332 r += numrep * (flocal+ftotal);
333 } else {
334 if (firstn)
335 /* r' = r + f_total */
336 r += ftotal;
337 else
338 /* r' = r + n*f_local */
339 r += numrep * (flocal+ftotal);
340 }
341
342 /* bucket choose */
343 if (flocal >= (in->size>>1) &&
344 flocal > orig_tries)
345 item = bucket_perm_choose(in, x, r);
346 else
347 item = crush_bucket_choose(in, x, r);
348 BUG_ON(item >= map->max_devices);
349
350 /* desired type? */
351 if (item < 0)
352 itemtype = map->buckets[-1-item]->type;
353 else
354 itemtype = 0;
355 dprintk(" item %d type %d\n", item, itemtype);
356
357 /* keep going? */
358 if (itemtype != type) {
359 BUG_ON(item >= 0 ||
360 (-1-item) >= map->max_buckets);
361 in = map->buckets[-1-item];
362 continue;
363 }
364
365 /* collision? */
366 collide = 0;
367 for (i = 0; i < outpos; i++) {
368 if (out[i] == item) {
369 collide = 1;
370 break;
371 }
372 }
373
374 if (recurse_to_leaf &&
375 item < 0 &&
376 crush_choose(map, map->buckets[-1-item],
377 weight,
378 x, outpos+1, 0,
379 out2, outpos,
380 firstn, 0, NULL) <= outpos) {
381 reject = 1;
382 } else {
383 /* out? */
384 if (itemtype == 0)
385 reject = is_out(map, weight,
386 item, x);
387 else
388 reject = 0;
389 }
390
391 if (reject || collide) {
392 ftotal++;
393 flocal++;
394
395 if (collide && flocal < 3)
396 /* retry locally a few times */
397 retry_bucket = 1;
398 else if (flocal < in->size + orig_tries)
399 /* exhaustive bucket search */
400 retry_bucket = 1;
401 else if (ftotal < 20)
402 /* then retry descent */
403 retry_descent = 1;
404 else
405 /* else give up */
406 skip_rep = 1;
407 dprintk(" reject %d collide %d "
408 "ftotal %d flocal %d\n",
409 reject, collide, ftotal,
410 flocal);
411 }
412 } while (retry_bucket);
413 } while (retry_descent);
414
415 if (skip_rep) {
416 dprintk("skip rep\n");
417 continue;
418 }
419
420 dprintk("choose got %d\n", item);
421 out[outpos] = item;
422 outpos++;
423 }
424
425 dprintk("choose returns %d\n", outpos);
426 return outpos;
427}
428
429
430/**
431 * crush_do_rule - calculate a mapping with the given input and rule
432 * @map: the crush_map
433 * @ruleno: the rule id
434 * @x: hash input
435 * @result: pointer to result vector
436 * @result_max: maximum result size
437 * @force: force initial replica choice; -1 for none
438 */
439int crush_do_rule(struct crush_map *map,
440 int ruleno, int x, int *result, int result_max,
441 int force, __u32 *weight)
442{
443 int result_len;
444 int force_context[CRUSH_MAX_DEPTH];
445 int force_pos = -1;
446 int a[CRUSH_MAX_SET];
447 int b[CRUSH_MAX_SET];
448 int c[CRUSH_MAX_SET];
449 int recurse_to_leaf;
450 int *w;
451 int wsize = 0;
452 int *o;
453 int osize;
454 int *tmp;
455 struct crush_rule *rule;
456 int step;
457 int i, j;
458 int numrep;
459 int firstn;
460 int rc = -1;
461
462 BUG_ON(ruleno >= map->max_rules);
463
464 rule = map->rules[ruleno];
465 result_len = 0;
466 w = a;
467 o = b;
468
469 /*
470 * determine hierarchical context of force, if any. note
471 * that this may or may not correspond to the specific types
472 * referenced by the crush rule.
473 */
474 if (force >= 0) {
475 if (force >= map->max_devices ||
476 map->device_parents[force] == 0) {
477 /*dprintk("CRUSH: forcefed device dne\n");*/
478 rc = -1; /* force fed device dne */
479 goto out;
480 }
481 if (!is_out(map, weight, force, x)) {
482 while (1) {
483 force_context[++force_pos] = force;
484 if (force >= 0)
485 force = map->device_parents[force];
486 else
487 force = map->bucket_parents[-1-force];
488 if (force == 0)
489 break;
490 }
491 }
492 }
493
494 for (step = 0; step < rule->len; step++) {
495 firstn = 0;
496 switch (rule->steps[step].op) {
497 case CRUSH_RULE_TAKE:
498 w[0] = rule->steps[step].arg1;
499 if (force_pos >= 0) {
500 BUG_ON(force_context[force_pos] != w[0]);
501 force_pos--;
502 }
503 wsize = 1;
504 break;
505
506 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
507 case CRUSH_RULE_CHOOSE_FIRSTN:
508 firstn = 1;
509 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
510 case CRUSH_RULE_CHOOSE_INDEP:
511 BUG_ON(wsize == 0);
512
513 recurse_to_leaf =
514 rule->steps[step].op ==
515 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
516 rule->steps[step].op ==
517 CRUSH_RULE_CHOOSE_LEAF_INDEP;
518
519 /* reset output */
520 osize = 0;
521
522 for (i = 0; i < wsize; i++) {
523 /*
524 * see CRUSH_N, CRUSH_N_MINUS macros.
525 * basically, numrep <= 0 means relative to
526 * the provided result_max
527 */
528 numrep = rule->steps[step].arg1;
529 if (numrep <= 0) {
530 numrep += result_max;
531 if (numrep <= 0)
532 continue;
533 }
534 j = 0;
535 if (osize == 0 && force_pos >= 0) {
536 /* skip any intermediate types */
537 while (force_pos &&
538 force_context[force_pos] < 0 &&
539 rule->steps[step].arg2 !=
540 map->buckets[-1 -
541 force_context[force_pos]]->type)
542 force_pos--;
543 o[osize] = force_context[force_pos];
544 if (recurse_to_leaf)
545 c[osize] = force_context[0];
546 j++;
547 force_pos--;
548 }
549 osize += crush_choose(map,
550 map->buckets[-1-w[i]],
551 weight,
552 x, numrep,
553 rule->steps[step].arg2,
554 o+osize, j,
555 firstn,
556 recurse_to_leaf, c+osize);
557 }
558
559 if (recurse_to_leaf)
560 /* copy final _leaf_ values to output set */
561 memcpy(o, c, osize*sizeof(*o));
562
563 /* swap t and w arrays */
564 tmp = o;
565 o = w;
566 w = tmp;
567 wsize = osize;
568 break;
569
570
571 case CRUSH_RULE_EMIT:
572 for (i = 0; i < wsize && result_len < result_max; i++) {
573 result[result_len] = w[i];
574 result_len++;
575 }
576 wsize = 0;
577 break;
578
579 default:
580 BUG_ON(1);
581 }
582 }
583 rc = result_len;
584
585out:
586 return rc;
587}
588
589
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
new file mode 100644
index 000000000000..98e90046fd9f
--- /dev/null
+++ b/fs/ceph/crush/mapper.h
@@ -0,0 +1,20 @@
1#ifndef _CRUSH_MAPPER_H
2#define _CRUSH_MAPPER_H
3
4/*
5 * CRUSH functions for find rules and then mapping an input to an
6 * output set.
7 *
8 * LGPL2
9 */
10
11#include "crush.h"
12
13extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
14extern int crush_do_rule(struct crush_map *map,
15 int ruleno,
16 int x, int *result, int result_max,
17 int forcefeed, /* -1 for none */
18 __u32 *weights);
19
20#endif