aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Sage Weil <sage@inktank.com> 2012-07-30 21:15:23 -0400
committer Sage Weil <sage@inktank.com> 2012-07-30 21:15:23 -0400
commit 546f04ef716dd49521774653d8b032a7d64c05d9 (patch)
tree 162f548fc7a81b05eb1db715997b3a04693c1bcc
parent 1fe60e51a3744528f3939b1b1167ca909133d9ae (diff)
libceph: support crush tunables
The server side recently added support for tuning some magic crush variables. Decode these variables if they are present, or use the default values if they are not present.

Corresponds to ceph.git commit 89af369c25f274fe62ef730e5e8aad0c54f1e5a5.

Signed-off-by: caleb miles <caleb.miles@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
-rw-r--r-- include/linux/ceph/ceph_features.h 5
-rw-r--r-- include/linux/crush/crush.h 8
-rw-r--r-- net/ceph/crush/mapper.c 13
-rw-r--r-- net/ceph/osdmap.c 39
4 files changed, 58 insertions, 7 deletions
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 342f93dbe162..dad579b0c0e6 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -12,12 +12,15 @@
12#define CEPH_FEATURE_MONNAMES (1<<5) 12#define CEPH_FEATURE_MONNAMES (1<<5)
13#define CEPH_FEATURE_RECONNECT_SEQ (1<<6) 13#define CEPH_FEATURE_RECONNECT_SEQ (1<<6)
14#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) 14#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7)
15/* bits 8-17 defined by user-space; not supported yet here */
16#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
15 17
16/* 18/*
17 * Features supported. 19 * Features supported.
18 */ 20 */
19#define CEPH_FEATURES_SUPPORTED_DEFAULT \ 21#define CEPH_FEATURES_SUPPORTED_DEFAULT \
20 (CEPH_FEATURE_NOSRCADDR) 22 (CEPH_FEATURE_NOSRCADDR | \
23 CEPH_FEATURE_CRUSH_TUNABLES)
21 24
22#define CEPH_FEATURES_REQUIRED_DEFAULT \ 25#define CEPH_FEATURES_REQUIRED_DEFAULT \
23 (CEPH_FEATURE_NOSRCADDR) 26 (CEPH_FEATURE_NOSRCADDR)
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index 7c4750811b96..25baa287cff7 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -154,6 +154,14 @@ struct crush_map {
154 __s32 max_buckets; 154 __s32 max_buckets;
155 __u32 max_rules; 155 __u32 max_rules;
156 __s32 max_devices; 156 __s32 max_devices;
157
158 /* choose local retries before re-descent */
159 __u32 choose_local_tries;
160 /* choose local attempts using a fallback permutation before
161 * re-descent */
162 __u32 choose_local_fallback_tries;
163 /* choose attempts before giving up */
164 __u32 choose_total_tries;
157}; 165};
158 166
159 167
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index d7edc24333b8..35fce755ce10 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -306,7 +306,6 @@ static int crush_choose(const struct crush_map *map,
306 int item = 0; 306 int item = 0;
307 int itemtype; 307 int itemtype;
308 int collide, reject; 308 int collide, reject;
309 const unsigned int orig_tries = 5; /* attempts before we fall back to search */
310 309
311 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", 310 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
312 bucket->id, x, outpos, numrep); 311 bucket->id, x, outpos, numrep);
@@ -351,8 +350,9 @@ static int crush_choose(const struct crush_map *map,
351 reject = 1; 350 reject = 1;
352 goto reject; 351 goto reject;
353 } 352 }
354 if (flocal >= (in->size>>1) && 353 if (map->choose_local_fallback_tries > 0 &&
355 flocal > orig_tries) 354 flocal >= (in->size>>1) &&
355 flocal > map->choose_local_fallback_tries)
356 item = bucket_perm_choose(in, x, r); 356 item = bucket_perm_choose(in, x, r);
357 else 357 else
358 item = crush_bucket_choose(in, x, r); 358 item = crush_bucket_choose(in, x, r);
@@ -422,13 +422,14 @@ reject:
422 ftotal++; 422 ftotal++;
423 flocal++; 423 flocal++;
424 424
425 if (collide && flocal < 3) 425 if (collide && flocal <= map->choose_local_tries)
426 /* retry locally a few times */ 426 /* retry locally a few times */
427 retry_bucket = 1; 427 retry_bucket = 1;
428 else if (flocal <= in->size + orig_tries) 428 else if (map->choose_local_fallback_tries > 0 &&
429 flocal <= in->size + map->choose_local_fallback_tries)
429 /* exhaustive bucket search */ 430 /* exhaustive bucket search */
430 retry_bucket = 1; 431 retry_bucket = 1;
431 else if (ftotal < 20) 432 else if (ftotal <= map->choose_total_tries)
432 /* then retry descent */ 433 /* then retry descent */
433 retry_descent = 1; 434 retry_descent = 1;
434 else 435 else
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 9600674c2c39..3124b71a8883 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -135,6 +135,21 @@ bad:
135 return -EINVAL; 135 return -EINVAL;
136} 136}
137 137
138static int skip_name_map(void **p, void *end)
139{
140 int len;
141 ceph_decode_32_safe(p, end, len ,bad);
142 while (len--) {
143 int strlen;
144 *p += sizeof(u32);
145 ceph_decode_32_safe(p, end, strlen, bad);
146 *p += strlen;
147}
148 return 0;
149bad:
150 return -EINVAL;
151}
152
138static struct crush_map *crush_decode(void *pbyval, void *end) 153static struct crush_map *crush_decode(void *pbyval, void *end)
139{ 154{
140 struct crush_map *c; 155 struct crush_map *c;
@@ -143,6 +158,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
143 void **p = &pbyval; 158 void **p = &pbyval;
144 void *start = pbyval; 159 void *start = pbyval;
145 u32 magic; 160 u32 magic;
161 u32 num_name_maps;
146 162
147 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); 163 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
148 164
@@ -150,6 +166,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
150 if (c == NULL) 166 if (c == NULL)
151 return ERR_PTR(-ENOMEM); 167 return ERR_PTR(-ENOMEM);
152 168
169 /* set tunables to default values */
170 c->choose_local_tries = 2;
171 c->choose_local_fallback_tries = 5;
172 c->choose_total_tries = 19;
173
153 ceph_decode_need(p, end, 4*sizeof(u32), bad); 174 ceph_decode_need(p, end, 4*sizeof(u32), bad);
154 magic = ceph_decode_32(p); 175 magic = ceph_decode_32(p);
155 if (magic != CRUSH_MAGIC) { 176 if (magic != CRUSH_MAGIC) {
@@ -297,7 +318,25 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
297 } 318 }
298 319
299 /* ignore trailing name maps. */ 320 /* ignore trailing name maps. */
321 for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
322 err = skip_name_map(p, end);
323 if (err < 0)
324 goto done;
325 }
326
327 /* tunables */
328 ceph_decode_need(p, end, 3*sizeof(u32), done);
329 c->choose_local_tries = ceph_decode_32(p);
330 c->choose_local_fallback_tries = ceph_decode_32(p);
331 c->choose_total_tries = ceph_decode_32(p);
332 dout("crush decode tunable choose_local_tries = %d",
333 c->choose_local_tries);
334 dout("crush decode tunable choose_local_fallback_tries = %d",
335 c->choose_local_fallback_tries);
336 dout("crush decode tunable choose_total_tries = %d",
337 c->choose_total_tries);
300 338
339done:
301 dout("crush_decode success\n"); 340 dout("crush_decode success\n");
302 return c; 341 return c;
303 342