aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2016-02-05 22:52:57 -0500
committer Linus Torvalds <torvalds@linux-foundation.org> 2016-02-05 22:52:57 -0500
commit 5d6a6a75e0036a88ac17a844686e4a9ef926e493 (patch)
tree 85bcb8a1ee20d5ebfca8aa23f0fa835ce8a5f87b
parent 9b108828ed25aff1239304437ec5fa8b9977a306 (diff)
parent b0b31a8ffe54abf0a455bcaee54dd92f08817164 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph fixes from Sage Weil:
 "We have a few wire protocol compatibility fixes, ports of a few recent
  CRUSH mapping changes, and a couple error path fixes"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
  libceph: MOSDOpReply v7 encoding
  libceph: advertise support for TUNABLES5
  crush: decode and initialize chooseleaf_stable
  crush: add chooseleaf_stable tunable
  crush: ensure take bucket value is valid
  crush: ensure bucket id is valid before indexing buckets array
  ceph: fix snap context leak in error path
  ceph: checking for IS_ERR instead of NULL
-rw-r--r-- fs/ceph/file.c 6
-rw-r--r-- include/linux/ceph/ceph_features.h 16
-rw-r--r-- include/linux/crush/crush.h 8
-rw-r--r-- net/ceph/crush/mapper.c 33
-rw-r--r-- net/ceph/osd_client.c 10
-rw-r--r-- net/ceph/osdmap.c 19
6 files changed, 75 insertions, 17 deletions
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 86a9c383955e..eb9028e8cfc5 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -698,8 +698,8 @@ static void ceph_aio_retry_work(struct work_struct *work)
698 698
699 req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2, 699 req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2,
700 false, GFP_NOFS); 700 false, GFP_NOFS);
701 if (IS_ERR(req)) { 701 if (!req) {
702 ret = PTR_ERR(req); 702 ret = -ENOMEM;
703 req = orig_req; 703 req = orig_req;
704 goto out; 704 goto out;
705 } 705 }
@@ -716,7 +716,6 @@ static void ceph_aio_retry_work(struct work_struct *work)
716 ceph_osdc_build_request(req, req->r_ops[0].extent.offset, 716 ceph_osdc_build_request(req, req->r_ops[0].extent.offset,
717 snapc, CEPH_NOSNAP, &aio_req->mtime); 717 snapc, CEPH_NOSNAP, &aio_req->mtime);
718 718
719 ceph_put_snap_context(snapc);
720 ceph_osdc_put_request(orig_req); 719 ceph_osdc_put_request(orig_req);
721 720
722 req->r_callback = ceph_aio_complete_req; 721 req->r_callback = ceph_aio_complete_req;
@@ -731,6 +730,7 @@ out:
731 ceph_aio_complete_req(req, NULL); 730 ceph_aio_complete_req(req, NULL);
732 } 731 }
733 732
733 ceph_put_snap_context(snapc);
734 kfree(aio_work); 734 kfree(aio_work);
735} 735}
736 736
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index f89b31d45cc8..c1ef6f14e7be 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -63,6 +63,18 @@
63#define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49) 63#define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49)
64// duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY 64// duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY
65#define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */ 65#define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */
66#define CEPH_FEATURE_MON_METADATA (1ULL<<50)
67#define CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT (1ULL<<51) /* can sort objs bitwise */
68#define CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES (1ULL<<52)
69#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 (1ULL<<53)
70#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54)
71#define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55)
72#define CEPH_FEATURE_NEW_OSDOP_ENCODING (1ULL<<56) /* New, v7 encoding */
73#define CEPH_FEATURE_MON_STATEFUL_SUB (1ULL<<57) /* stateful mon subscription */
74#define CEPH_FEATURE_MON_ROUTE_OSDMAP (1ULL<<57) /* peon sends osdmaps */
75#define CEPH_FEATURE_CRUSH_TUNABLES5 (1ULL<<58) /* chooseleaf stable mode */
76// duplicated since it was introduced at the same time as CEPH_FEATURE_CRUSH_TUNABLES5
77#define CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING (1ULL<<58) /* New, v7 encoding */
66 78
67/* 79/*
68 * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature 80 * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature
@@ -108,7 +120,9 @@ static inline u64 ceph_sanitize_features(u64 features)
108 CEPH_FEATURE_CRUSH_TUNABLES3 | \ 120 CEPH_FEATURE_CRUSH_TUNABLES3 | \
109 CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ 121 CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \
110 CEPH_FEATURE_MSGR_KEEPALIVE2 | \ 122 CEPH_FEATURE_MSGR_KEEPALIVE2 | \
111 CEPH_FEATURE_CRUSH_V4) 123 CEPH_FEATURE_CRUSH_V4 | \
124 CEPH_FEATURE_CRUSH_TUNABLES5 | \
125 CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING)
112 126
113#define CEPH_FEATURES_REQUIRED_DEFAULT \ 127#define CEPH_FEATURES_REQUIRED_DEFAULT \
114 (CEPH_FEATURE_NOSRCADDR | \ 128 (CEPH_FEATURE_NOSRCADDR | \
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index 48b49305716b..be8f12b8f195 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -59,7 +59,8 @@ enum {
59 CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ 59 CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
60 CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, 60 CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
61 CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, 61 CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
62 CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12 62 CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12,
63 CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13
63}; 64};
64 65
65/* 66/*
@@ -205,6 +206,11 @@ struct crush_map {
205 * mappings line up a bit better with previous mappings. */ 206 * mappings line up a bit better with previous mappings. */
206 __u8 chooseleaf_vary_r; 207 __u8 chooseleaf_vary_r;
207 208
209 /* if true, it makes chooseleaf firstn to return stable results (if
210 * no local retry) so that data migrations would be optimal when some
211 * device fails. */
212 __u8 chooseleaf_stable;
213
208#ifndef __KERNEL__ 214#ifndef __KERNEL__
209 /* 215 /*
210 * version 0 (original) of straw_calc has various flaws. version 1 216 * version 0 (original) of straw_calc has various flaws. version 1
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 393bfb22d5bb..5fcfb98f309e 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -403,6 +403,7 @@ static int is_out(const struct crush_map *map,
403 * @local_retries: localized retries 403 * @local_retries: localized retries
404 * @local_fallback_retries: localized fallback retries 404 * @local_fallback_retries: localized fallback retries
405 * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) 405 * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
406 * @stable: stable mode starts rep=0 in the recursive call for all replicas
406 * @vary_r: pass r to recursive calls 407 * @vary_r: pass r to recursive calls
407 * @out2: second output vector for leaf items (if @recurse_to_leaf) 408 * @out2: second output vector for leaf items (if @recurse_to_leaf)
408 * @parent_r: r value passed from the parent 409 * @parent_r: r value passed from the parent
@@ -419,6 +420,7 @@ static int crush_choose_firstn(const struct crush_map *map,
419 unsigned int local_fallback_retries, 420 unsigned int local_fallback_retries,
420 int recurse_to_leaf, 421 int recurse_to_leaf,
421 unsigned int vary_r, 422 unsigned int vary_r,
423 unsigned int stable,
422 int *out2, 424 int *out2,
423 int parent_r) 425 int parent_r)
424{ 426{
@@ -433,13 +435,13 @@ static int crush_choose_firstn(const struct crush_map *map,
433 int collide, reject; 435 int collide, reject;
434 int count = out_size; 436 int count = out_size;
435 437
436 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", 438 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n",
437 recurse_to_leaf ? "_LEAF" : "", 439 recurse_to_leaf ? "_LEAF" : "",
438 bucket->id, x, outpos, numrep, 440 bucket->id, x, outpos, numrep,
439 tries, recurse_tries, local_retries, local_fallback_retries, 441 tries, recurse_tries, local_retries, local_fallback_retries,
440 parent_r); 442 parent_r, stable);
441 443
442 for (rep = outpos; rep < numrep && count > 0 ; rep++) { 444 for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) {
443 /* keep trying until we get a non-out, non-colliding item */ 445 /* keep trying until we get a non-out, non-colliding item */
444 ftotal = 0; 446 ftotal = 0;
445 skip_rep = 0; 447 skip_rep = 0;
@@ -512,13 +514,14 @@ static int crush_choose_firstn(const struct crush_map *map,
512 if (crush_choose_firstn(map, 514 if (crush_choose_firstn(map,
513 map->buckets[-1-item], 515 map->buckets[-1-item],
514 weight, weight_max, 516 weight, weight_max,
515 x, outpos+1, 0, 517 x, stable ? 1 : outpos+1, 0,
516 out2, outpos, count, 518 out2, outpos, count,
517 recurse_tries, 0, 519 recurse_tries, 0,
518 local_retries, 520 local_retries,
519 local_fallback_retries, 521 local_fallback_retries,
520 0, 522 0,
521 vary_r, 523 vary_r,
524 stable,
522 NULL, 525 NULL,
523 sub_r) <= outpos) 526 sub_r) <= outpos)
524 /* didn't get leaf */ 527 /* didn't get leaf */
@@ -816,6 +819,7 @@ int crush_do_rule(const struct crush_map *map,
816 int choose_local_fallback_retries = map->choose_local_fallback_tries; 819 int choose_local_fallback_retries = map->choose_local_fallback_tries;
817 820
818 int vary_r = map->chooseleaf_vary_r; 821 int vary_r = map->chooseleaf_vary_r;
822 int stable = map->chooseleaf_stable;
819 823
820 if ((__u32)ruleno >= map->max_rules) { 824 if ((__u32)ruleno >= map->max_rules) {
821 dprintk(" bad ruleno %d\n", ruleno); 825 dprintk(" bad ruleno %d\n", ruleno);
@@ -835,7 +839,8 @@ int crush_do_rule(const struct crush_map *map,
835 case CRUSH_RULE_TAKE: 839 case CRUSH_RULE_TAKE:
836 if ((curstep->arg1 >= 0 && 840 if ((curstep->arg1 >= 0 &&
837 curstep->arg1 < map->max_devices) || 841 curstep->arg1 < map->max_devices) ||
838 (-1-curstep->arg1 < map->max_buckets && 842 (-1-curstep->arg1 >= 0 &&
843 -1-curstep->arg1 < map->max_buckets &&
839 map->buckets[-1-curstep->arg1])) { 844 map->buckets[-1-curstep->arg1])) {
840 w[0] = curstep->arg1; 845 w[0] = curstep->arg1;
841 wsize = 1; 846 wsize = 1;
@@ -869,6 +874,11 @@ int crush_do_rule(const struct crush_map *map,
869 vary_r = curstep->arg1; 874 vary_r = curstep->arg1;
870 break; 875 break;
871 876
877 case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
878 if (curstep->arg1 >= 0)
879 stable = curstep->arg1;
880 break;
881
872 case CRUSH_RULE_CHOOSELEAF_FIRSTN: 882 case CRUSH_RULE_CHOOSELEAF_FIRSTN:
873 case CRUSH_RULE_CHOOSE_FIRSTN: 883 case CRUSH_RULE_CHOOSE_FIRSTN:
874 firstn = 1; 884 firstn = 1;
@@ -888,6 +898,7 @@ int crush_do_rule(const struct crush_map *map,
888 osize = 0; 898 osize = 0;
889 899
890 for (i = 0; i < wsize; i++) { 900 for (i = 0; i < wsize; i++) {
901 int bno;
891 /* 902 /*
892 * see CRUSH_N, CRUSH_N_MINUS macros. 903 * see CRUSH_N, CRUSH_N_MINUS macros.
893 * basically, numrep <= 0 means relative to 904 * basically, numrep <= 0 means relative to
@@ -900,6 +911,13 @@ int crush_do_rule(const struct crush_map *map,
900 continue; 911 continue;
901 } 912 }
902 j = 0; 913 j = 0;
914 /* make sure bucket id is valid */
915 bno = -1 - w[i];
916 if (bno < 0 || bno >= map->max_buckets) {
917 /* w[i] is probably CRUSH_ITEM_NONE */
918 dprintk(" bad w[i] %d\n", w[i]);
919 continue;
920 }
903 if (firstn) { 921 if (firstn) {
904 int recurse_tries; 922 int recurse_tries;
905 if (choose_leaf_tries) 923 if (choose_leaf_tries)
@@ -911,7 +929,7 @@ int crush_do_rule(const struct crush_map *map,
911 recurse_tries = choose_tries; 929 recurse_tries = choose_tries;
912 osize += crush_choose_firstn( 930 osize += crush_choose_firstn(
913 map, 931 map,
914 map->buckets[-1-w[i]], 932 map->buckets[bno],
915 weight, weight_max, 933 weight, weight_max,
916 x, numrep, 934 x, numrep,
917 curstep->arg2, 935 curstep->arg2,
@@ -923,6 +941,7 @@ int crush_do_rule(const struct crush_map *map,
923 choose_local_fallback_retries, 941 choose_local_fallback_retries,
924 recurse_to_leaf, 942 recurse_to_leaf,
925 vary_r, 943 vary_r,
944 stable,
926 c+osize, 945 c+osize,
927 0); 946 0);
928 } else { 947 } else {
@@ -930,7 +949,7 @@ int crush_do_rule(const struct crush_map *map,
930 numrep : (result_max-osize)); 949 numrep : (result_max-osize));
931 crush_choose_indep( 950 crush_choose_indep(
932 map, 951 map,
933 map->buckets[-1-w[i]], 952 map->buckets[bno],
934 weight, weight_max, 953 weight, weight_max,
935 x, out_size, numrep, 954 x, out_size, numrep,
936 curstep->arg2, 955 curstep->arg2,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index f8f235930d88..3534e12683d3 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1770,6 +1770,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1770 u32 osdmap_epoch; 1770 u32 osdmap_epoch;
1771 int already_completed; 1771 int already_completed;
1772 u32 bytes; 1772 u32 bytes;
1773 u8 decode_redir;
1773 unsigned int i; 1774 unsigned int i;
1774 1775
1775 tid = le64_to_cpu(msg->hdr.tid); 1776 tid = le64_to_cpu(msg->hdr.tid);
@@ -1841,6 +1842,15 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1841 p += 8 + 4; /* skip replay_version */ 1842 p += 8 + 4; /* skip replay_version */
1842 p += 8; /* skip user_version */ 1843 p += 8; /* skip user_version */
1843 1844
1845 if (le16_to_cpu(msg->hdr.version) >= 7)
1846 ceph_decode_8_safe(&p, end, decode_redir, bad_put);
1847 else
1848 decode_redir = 1;
1849 } else {
1850 decode_redir = 0;
1851 }
1852
1853 if (decode_redir) {
1844 err = ceph_redirect_decode(&p, end, &redir); 1854 err = ceph_redirect_decode(&p, end, &redir);
1845 if (err) 1855 if (err)
1846 goto bad_put; 1856 goto bad_put;
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 7d8f581d9f1f..243574c8cf33 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -342,23 +342,32 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
342 c->choose_local_tries = ceph_decode_32(p); 342 c->choose_local_tries = ceph_decode_32(p);
343 c->choose_local_fallback_tries = ceph_decode_32(p); 343 c->choose_local_fallback_tries = ceph_decode_32(p);
344 c->choose_total_tries = ceph_decode_32(p); 344 c->choose_total_tries = ceph_decode_32(p);
345 dout("crush decode tunable choose_local_tries = %d", 345 dout("crush decode tunable choose_local_tries = %d\n",
346 c->choose_local_tries); 346 c->choose_local_tries);
347 dout("crush decode tunable choose_local_fallback_tries = %d", 347 dout("crush decode tunable choose_local_fallback_tries = %d\n",
348 c->choose_local_fallback_tries); 348 c->choose_local_fallback_tries);
349 dout("crush decode tunable choose_total_tries = %d", 349 dout("crush decode tunable choose_total_tries = %d\n",
350 c->choose_total_tries); 350 c->choose_total_tries);
351 351
352 ceph_decode_need(p, end, sizeof(u32), done); 352 ceph_decode_need(p, end, sizeof(u32), done);
353 c->chooseleaf_descend_once = ceph_decode_32(p); 353 c->chooseleaf_descend_once = ceph_decode_32(p);
354 dout("crush decode tunable chooseleaf_descend_once = %d", 354 dout("crush decode tunable chooseleaf_descend_once = %d\n",
355 c->chooseleaf_descend_once); 355 c->chooseleaf_descend_once);
356 356
357 ceph_decode_need(p, end, sizeof(u8), done); 357 ceph_decode_need(p, end, sizeof(u8), done);
358 c->chooseleaf_vary_r = ceph_decode_8(p); 358 c->chooseleaf_vary_r = ceph_decode_8(p);
359 dout("crush decode tunable chooseleaf_vary_r = %d", 359 dout("crush decode tunable chooseleaf_vary_r = %d\n",
360 c->chooseleaf_vary_r); 360 c->chooseleaf_vary_r);
361 361
362 /* skip straw_calc_version, allowed_bucket_algs */
363 ceph_decode_need(p, end, sizeof(u8) + sizeof(u32), done);
364 *p += sizeof(u8) + sizeof(u32);
365
366 ceph_decode_need(p, end, sizeof(u8), done);
367 c->chooseleaf_stable = ceph_decode_8(p);
368 dout("crush decode tunable chooseleaf_stable = %d\n",
369 c->chooseleaf_stable);
370
362done: 371done:
363 dout("crush_decode success\n"); 372 dout("crush_decode success\n");
364 return c; 373 return c;