aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ceph/crush/crush.h
diff options
context:
space:
mode:
authorSage Weil <sage@newdream.net>2009-10-06 14:31:11 -0400
committerSage Weil <sage@newdream.net>2009-10-06 14:31:11 -0400
commit5ecc0a0f8128b1876e8614638deaed49cc8b174c (patch)
tree6b1511b0e3ec2a349a24faebc375548b14ee137a /fs/ceph/crush/crush.h
parentf24e9980eb860d8600cbe5ef3d2fd9295320d229 (diff)
ceph: CRUSH mapping algorithm
CRUSH is a pseudorandom data distribution function designed to map inputs onto a dynamic hierarchy of devices, while minimizing the extent to which inputs are remapped when the devices are added or removed. It includes some features that are specifically useful for storage, most notably the ability to map each input onto a set of N devices that are separated across administrator-defined failure domains. CRUSH is used to distribute data across the cluster of Ceph storage nodes. More information about CRUSH can be found in this paper: http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf Signed-off-by: Sage Weil <sage@newdream.net>
Diffstat (limited to 'fs/ceph/crush/crush.h')
-rw-r--r--fs/ceph/crush/crush.h188
1 files changed, 188 insertions, 0 deletions
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
new file mode 100644
index 000000000000..9ac7e091126f
--- /dev/null
+++ b/fs/ceph/crush/crush.h
@@ -0,0 +1,188 @@
1#ifndef _CRUSH_CRUSH_H
2#define _CRUSH_CRUSH_H
3
4#include <linux/types.h>
5
6/*
7 * CRUSH is a pseudo-random data distribution algorithm that
8 * efficiently distributes input values (typically, data objects)
9 * across a heterogeneous, structured storage cluster.
10 *
11 * The algorithm was originally described in detail in this paper
12 * (although the algorithm has evolved somewhat since then):
13 *
14 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
15 *
16 * LGPL2
17 */
18
19
20#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
21
22
23#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
24#define CRUSH_MAX_SET 10 /* max size of a mapping result */
25
26
27/*
28 * CRUSH uses user-defined "rules" to describe how inputs should be
29 * mapped to devices. A rule consists of sequence of steps to perform
30 * to generate the set of output devices.
31 */
32struct crush_rule_step {
33 __u32 op;
34 __s32 arg1;
35 __s32 arg2;
36};
37
38/* step op codes */
39enum {
40 CRUSH_RULE_NOOP = 0,
41 CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
42 CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
43 /* arg2 = type */
44 CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
45 CRUSH_RULE_EMIT = 4, /* no args */
46 CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
47 CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
48};
49
50/*
51 * for specifying choose num (arg1) relative to the max parameter
52 * passed to do_rule
53 */
54#define CRUSH_CHOOSE_N 0
55#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
56
57/*
58 * The rule mask is used to describe what the rule is intended for.
59 * Given a ruleset and size of output set, we search through the
60 * rule list for a matching rule_mask.
61 */
62struct crush_rule_mask {
63 __u8 ruleset;
64 __u8 type;
65 __u8 min_size;
66 __u8 max_size;
67};
68
69struct crush_rule {
70 __u32 len;
71 struct crush_rule_mask mask;
72 struct crush_rule_step steps[0];
73};
74
75#define crush_rule_size(len) (sizeof(struct crush_rule) + \
76 (len)*sizeof(struct crush_rule_step))
77
78
79
80/*
81 * A bucket is a named container of other items (either devices or
82 * other buckets). Items within a bucket are chosen using one of a
83 * few different algorithms. The table summarizes how the speed of
84 * each option measures up against mapping stability when items are
85 * added or removed.
86 *
87 * Bucket Alg Speed Additions Removals
88 * ------------------------------------------------
89 * uniform O(1) poor poor
90 * list O(n) optimal poor
91 * tree O(log n) good good
92 * straw O(n) optimal optimal
93 */
94enum {
95 CRUSH_BUCKET_UNIFORM = 1,
96 CRUSH_BUCKET_LIST = 2,
97 CRUSH_BUCKET_TREE = 3,
98 CRUSH_BUCKET_STRAW = 4
99};
100static inline const char *crush_bucket_alg_name(int alg)
101{
102 switch (alg) {
103 case CRUSH_BUCKET_UNIFORM: return "uniform";
104 case CRUSH_BUCKET_LIST: return "list";
105 case CRUSH_BUCKET_TREE: return "tree";
106 case CRUSH_BUCKET_STRAW: return "straw";
107 default: return "unknown";
108 }
109}
110
111struct crush_bucket {
112 __s32 id; /* this'll be negative */
113 __u16 type; /* non-zero; type=0 is reserved for devices */
114 __u16 alg; /* one of CRUSH_BUCKET_* */
115 __u32 weight; /* 16-bit fixed point */
116 __u32 size; /* num items */
117 __s32 *items;
118
119 /*
120 * cached random permutation: used for uniform bucket and for
121 * the linear search fallback for the other bucket types.
122 */
123 __u32 perm_x; /* @x for which *perm is defined */
124 __u32 perm_n; /* num elements of *perm that are permuted/defined */
125 __u32 *perm;
126};
127
128struct crush_bucket_uniform {
129 struct crush_bucket h;
130 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
131};
132
133struct crush_bucket_list {
134 struct crush_bucket h;
135 __u32 *item_weights; /* 16-bit fixed point */
136 __u32 *sum_weights; /* 16-bit fixed point. element i is sum
137 of weights 0..i, inclusive */
138};
139
140struct crush_bucket_tree {
141 struct crush_bucket h; /* note: h.size is _tree_ size, not number of
142 actual items */
143 __u8 num_nodes;
144 __u32 *node_weights;
145};
146
147struct crush_bucket_straw {
148 struct crush_bucket h;
149 __u32 *item_weights; /* 16-bit fixed point */
150 __u32 *straws; /* 16-bit fixed point */
151};
152
153
154
155/*
156 * CRUSH map includes all buckets, rules, etc.
157 */
158struct crush_map {
159 struct crush_bucket **buckets;
160 struct crush_rule **rules;
161
162 /*
163 * Parent pointers to identify the parent bucket a device or
164 * bucket in the hierarchy. If an item appears more than
165 * once, this is the _last_ time it appeared (where buckets
166 * are processed in bucket id order, from -1 on down to
167 * -max_buckets.
168 */
169 __u32 *bucket_parents;
170 __u32 *device_parents;
171
172 __s32 max_buckets;
173 __u32 max_rules;
174 __s32 max_devices;
175};
176
177
178/* crush.c */
179extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
180extern void crush_calc_parents(struct crush_map *map);
181extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
182extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
183extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
184extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
185extern void crush_destroy_bucket(struct crush_bucket *b);
186extern void crush_destroy(struct crush_map *map);
187
188#endif