diff options
author | Sage Weil <sage@inktank.com> | 2012-07-30 21:15:23 -0400 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2012-07-30 21:15:23 -0400 |
commit | 546f04ef716dd49521774653d8b032a7d64c05d9 (patch) | |
tree | 162f548fc7a81b05eb1db715997b3a04693c1bcc | |
parent | 1fe60e51a3744528f3939b1b1167ca909133d9ae (diff) |
libceph: support crush tunables
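The server side recently added support for tuning three previously
hard-coded crush variables (choose_local_tries, choose_local_fallback_tries
and choose_total_tries). Decode these variables if they are present in the
encoded crush map, or use the default values if they are not present.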
Corresponds to ceph.git commit 89af369c25f274fe62ef730e5e8aad0c54f1e5a5.
Signed-off-by: caleb miles <caleb.miles@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
-rw-r--r-- | include/linux/ceph/ceph_features.h | 5 | ||||
-rw-r--r-- | include/linux/crush/crush.h | 8 | ||||
-rw-r--r-- | net/ceph/crush/mapper.c | 13 | ||||
-rw-r--r-- | net/ceph/osdmap.c | 39 |
4 files changed, 58 insertions, 7 deletions
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 342f93dbe162..dad579b0c0e6 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h | |||
@@ -12,12 +12,15 @@ | |||
12 | #define CEPH_FEATURE_MONNAMES (1<<5) | 12 | #define CEPH_FEATURE_MONNAMES (1<<5) |
13 | #define CEPH_FEATURE_RECONNECT_SEQ (1<<6) | 13 | #define CEPH_FEATURE_RECONNECT_SEQ (1<<6) |
14 | #define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) | 14 | #define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) |
15 | /* bits 8-17 defined by user-space; not supported yet here */ | ||
16 | #define CEPH_FEATURE_CRUSH_TUNABLES (1<<18) | ||
15 | 17 | ||
16 | /* | 18 | /* |
17 | * Features supported. | 19 | * Features supported. |
18 | */ | 20 | */ |
19 | #define CEPH_FEATURES_SUPPORTED_DEFAULT \ | 21 | #define CEPH_FEATURES_SUPPORTED_DEFAULT \ |
20 | (CEPH_FEATURE_NOSRCADDR) | 22 | (CEPH_FEATURE_NOSRCADDR | \ |
23 | CEPH_FEATURE_CRUSH_TUNABLES) | ||
21 | 24 | ||
22 | #define CEPH_FEATURES_REQUIRED_DEFAULT \ | 25 | #define CEPH_FEATURES_REQUIRED_DEFAULT \ |
23 | (CEPH_FEATURE_NOSRCADDR) | 26 | (CEPH_FEATURE_NOSRCADDR) |
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 7c4750811b96..25baa287cff7 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h | |||
@@ -154,6 +154,14 @@ struct crush_map { | |||
154 | __s32 max_buckets; | 154 | __s32 max_buckets; |
155 | __u32 max_rules; | 155 | __u32 max_rules; |
156 | __s32 max_devices; | 156 | __s32 max_devices; |
157 | |||
158 | /* choose local retries before re-descent */ | ||
159 | __u32 choose_local_tries; | ||
160 | /* choose local attempts using a fallback permutation before | ||
161 | * re-descent */ | ||
162 | __u32 choose_local_fallback_tries; | ||
163 | /* choose attempts before giving up */ | ||
164 | __u32 choose_total_tries; | ||
157 | }; | 165 | }; |
158 | 166 | ||
159 | 167 | ||
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index d7edc24333b8..35fce755ce10 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c | |||
@@ -306,7 +306,6 @@ static int crush_choose(const struct crush_map *map, | |||
306 | int item = 0; | 306 | int item = 0; |
307 | int itemtype; | 307 | int itemtype; |
308 | int collide, reject; | 308 | int collide, reject; |
309 | const unsigned int orig_tries = 5; /* attempts before we fall back to search */ | ||
310 | 309 | ||
311 | dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", | 310 | dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", |
312 | bucket->id, x, outpos, numrep); | 311 | bucket->id, x, outpos, numrep); |
@@ -351,8 +350,9 @@ static int crush_choose(const struct crush_map *map, | |||
351 | reject = 1; | 350 | reject = 1; |
352 | goto reject; | 351 | goto reject; |
353 | } | 352 | } |
354 | if (flocal >= (in->size>>1) && | 353 | if (map->choose_local_fallback_tries > 0 && |
355 | flocal > orig_tries) | 354 | flocal >= (in->size>>1) && |
355 | flocal > map->choose_local_fallback_tries) | ||
356 | item = bucket_perm_choose(in, x, r); | 356 | item = bucket_perm_choose(in, x, r); |
357 | else | 357 | else |
358 | item = crush_bucket_choose(in, x, r); | 358 | item = crush_bucket_choose(in, x, r); |
@@ -422,13 +422,14 @@ reject: | |||
422 | ftotal++; | 422 | ftotal++; |
423 | flocal++; | 423 | flocal++; |
424 | 424 | ||
425 | if (collide && flocal < 3) | 425 | if (collide && flocal <= map->choose_local_tries) |
426 | /* retry locally a few times */ | 426 | /* retry locally a few times */ |
427 | retry_bucket = 1; | 427 | retry_bucket = 1; |
428 | else if (flocal <= in->size + orig_tries) | 428 | else if (map->choose_local_fallback_tries > 0 && |
429 | flocal <= in->size + map->choose_local_fallback_tries) | ||
429 | /* exhaustive bucket search */ | 430 | /* exhaustive bucket search */ |
430 | retry_bucket = 1; | 431 | retry_bucket = 1; |
431 | else if (ftotal < 20) | 432 | else if (ftotal <= map->choose_total_tries) |
432 | /* then retry descent */ | 433 | /* then retry descent */ |
433 | retry_descent = 1; | 434 | retry_descent = 1; |
434 | else | 435 | else |
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 9600674c2c39..3124b71a8883 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c | |||
@@ -135,6 +135,21 @@ bad: | |||
135 | return -EINVAL; | 135 | return -EINVAL; |
136 | } | 136 | } |
137 | 137 | ||
138 | static int skip_name_map(void **p, void *end) | ||
139 | { | ||
140 | int len; | ||
141 | ceph_decode_32_safe(p, end, len ,bad); | ||
142 | while (len--) { | ||
143 | int strlen; | ||
144 | *p += sizeof(u32); | ||
145 | ceph_decode_32_safe(p, end, strlen, bad); | ||
146 | *p += strlen; | ||
147 | } | ||
148 | return 0; | ||
149 | bad: | ||
150 | return -EINVAL; | ||
151 | } | ||
152 | |||
138 | static struct crush_map *crush_decode(void *pbyval, void *end) | 153 | static struct crush_map *crush_decode(void *pbyval, void *end) |
139 | { | 154 | { |
140 | struct crush_map *c; | 155 | struct crush_map *c; |
@@ -143,6 +158,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end) | |||
143 | void **p = &pbyval; | 158 | void **p = &pbyval; |
144 | void *start = pbyval; | 159 | void *start = pbyval; |
145 | u32 magic; | 160 | u32 magic; |
161 | u32 num_name_maps; | ||
146 | 162 | ||
147 | dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); | 163 | dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); |
148 | 164 | ||
@@ -150,6 +166,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end) | |||
150 | if (c == NULL) | 166 | if (c == NULL) |
151 | return ERR_PTR(-ENOMEM); | 167 | return ERR_PTR(-ENOMEM); |
152 | 168 | ||
169 | /* set tunables to default values */ | ||
170 | c->choose_local_tries = 2; | ||
171 | c->choose_local_fallback_tries = 5; | ||
172 | c->choose_total_tries = 19; | ||
173 | |||
153 | ceph_decode_need(p, end, 4*sizeof(u32), bad); | 174 | ceph_decode_need(p, end, 4*sizeof(u32), bad); |
154 | magic = ceph_decode_32(p); | 175 | magic = ceph_decode_32(p); |
155 | if (magic != CRUSH_MAGIC) { | 176 | if (magic != CRUSH_MAGIC) { |
@@ -297,7 +318,25 @@ static struct crush_map *crush_decode(void *pbyval, void *end) | |||
297 | } | 318 | } |
298 | 319 | ||
299 | /* ignore trailing name maps. */ | 320 | /* ignore trailing name maps. */ |
321 | for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) { | ||
322 | err = skip_name_map(p, end); | ||
323 | if (err < 0) | ||
324 | goto done; | ||
325 | } | ||
326 | |||
327 | /* tunables */ | ||
328 | ceph_decode_need(p, end, 3*sizeof(u32), done); | ||
329 | c->choose_local_tries = ceph_decode_32(p); | ||
330 | c->choose_local_fallback_tries = ceph_decode_32(p); | ||
331 | c->choose_total_tries = ceph_decode_32(p); | ||
332 | dout("crush decode tunable choose_local_tries = %d", | ||
333 | c->choose_local_tries); | ||
334 | dout("crush decode tunable choose_local_fallback_tries = %d", | ||
335 | c->choose_local_fallback_tries); | ||
336 | dout("crush decode tunable choose_total_tries = %d", | ||
337 | c->choose_total_tries); | ||
300 | 338 | ||
339 | done: | ||
301 | dout("crush_decode success\n"); | 340 | dout("crush_decode success\n"); |
302 | return c; | 341 | return c; |
303 | 342 | ||