diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-02-05 22:52:57 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-02-05 22:52:57 -0500 |
commit | 5d6a6a75e0036a88ac17a844686e4a9ef926e493 (patch) | |
tree | 85bcb8a1ee20d5ebfca8aa23f0fa835ce8a5f87b | |
parent | 9b108828ed25aff1239304437ec5fa8b9977a306 (diff) | |
parent | b0b31a8ffe54abf0a455bcaee54dd92f08817164 (diff) |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph fixes from Sage Weil:
"We have a few wire protocol compatibility fixes, ports of a few recent
CRUSH mapping changes, and a couple of error path fixes"
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
libceph: MOSDOpReply v7 encoding
libceph: advertise support for TUNABLES5
crush: decode and initialize chooseleaf_stable
crush: add chooseleaf_stable tunable
crush: ensure take bucket value is valid
crush: ensure bucket id is valid before indexing buckets array
ceph: fix snap context leak in error path
ceph: checking for IS_ERR instead of NULL
-rw-r--r-- | fs/ceph/file.c | 6 | ||||
-rw-r--r-- | include/linux/ceph/ceph_features.h | 16 | ||||
-rw-r--r-- | include/linux/crush/crush.h | 8 | ||||
-rw-r--r-- | net/ceph/crush/mapper.c | 33 | ||||
-rw-r--r-- | net/ceph/osd_client.c | 10 | ||||
-rw-r--r-- | net/ceph/osdmap.c | 19 |
6 files changed, 75 insertions, 17 deletions
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 86a9c383955e..eb9028e8cfc5 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c | |||
@@ -698,8 +698,8 @@ static void ceph_aio_retry_work(struct work_struct *work) | |||
698 | 698 | ||
699 | req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2, | 699 | req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2, |
700 | false, GFP_NOFS); | 700 | false, GFP_NOFS); |
701 | if (IS_ERR(req)) { | 701 | if (!req) { |
702 | ret = PTR_ERR(req); | 702 | ret = -ENOMEM; |
703 | req = orig_req; | 703 | req = orig_req; |
704 | goto out; | 704 | goto out; |
705 | } | 705 | } |
@@ -716,7 +716,6 @@ static void ceph_aio_retry_work(struct work_struct *work) | |||
716 | ceph_osdc_build_request(req, req->r_ops[0].extent.offset, | 716 | ceph_osdc_build_request(req, req->r_ops[0].extent.offset, |
717 | snapc, CEPH_NOSNAP, &aio_req->mtime); | 717 | snapc, CEPH_NOSNAP, &aio_req->mtime); |
718 | 718 | ||
719 | ceph_put_snap_context(snapc); | ||
720 | ceph_osdc_put_request(orig_req); | 719 | ceph_osdc_put_request(orig_req); |
721 | 720 | ||
722 | req->r_callback = ceph_aio_complete_req; | 721 | req->r_callback = ceph_aio_complete_req; |
@@ -731,6 +730,7 @@ out: | |||
731 | ceph_aio_complete_req(req, NULL); | 730 | ceph_aio_complete_req(req, NULL); |
732 | } | 731 | } |
733 | 732 | ||
733 | ceph_put_snap_context(snapc); | ||
734 | kfree(aio_work); | 734 | kfree(aio_work); |
735 | } | 735 | } |
736 | 736 | ||
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index f89b31d45cc8..c1ef6f14e7be 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h | |||
@@ -63,6 +63,18 @@ | |||
63 | #define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49) | 63 | #define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49) |
64 | // duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY | 64 | // duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY |
65 | #define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */ | 65 | #define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */ |
66 | #define CEPH_FEATURE_MON_METADATA (1ULL<<50) | ||
67 | #define CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT (1ULL<<51) /* can sort objs bitwise */ | ||
68 | #define CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES (1ULL<<52) | ||
69 | #define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 (1ULL<<53) | ||
70 | #define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54) | ||
71 | #define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55) | ||
72 | #define CEPH_FEATURE_NEW_OSDOP_ENCODING (1ULL<<56) /* New, v7 encoding */ | ||
73 | #define CEPH_FEATURE_MON_STATEFUL_SUB (1ULL<<57) /* stateful mon subscription */ | ||
74 | #define CEPH_FEATURE_MON_ROUTE_OSDMAP (1ULL<<57) /* peon sends osdmaps */ | ||
75 | #define CEPH_FEATURE_CRUSH_TUNABLES5 (1ULL<<58) /* chooseleaf stable mode */ | ||
76 | // duplicated since it was introduced at the same time as CEPH_FEATURE_CRUSH_TUNABLES5 | ||
77 | #define CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING (1ULL<<58) /* New, v7 encoding */ | ||
66 | 78 | ||
67 | /* | 79 | /* |
68 | * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature | 80 | * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature |
@@ -108,7 +120,9 @@ static inline u64 ceph_sanitize_features(u64 features) | |||
108 | CEPH_FEATURE_CRUSH_TUNABLES3 | \ | 120 | CEPH_FEATURE_CRUSH_TUNABLES3 | \ |
109 | CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ | 121 | CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ |
110 | CEPH_FEATURE_MSGR_KEEPALIVE2 | \ | 122 | CEPH_FEATURE_MSGR_KEEPALIVE2 | \ |
111 | CEPH_FEATURE_CRUSH_V4) | 123 | CEPH_FEATURE_CRUSH_V4 | \ |
124 | CEPH_FEATURE_CRUSH_TUNABLES5 | \ | ||
125 | CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING) | ||
112 | 126 | ||
113 | #define CEPH_FEATURES_REQUIRED_DEFAULT \ | 127 | #define CEPH_FEATURES_REQUIRED_DEFAULT \ |
114 | (CEPH_FEATURE_NOSRCADDR | \ | 128 | (CEPH_FEATURE_NOSRCADDR | \ |
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 48b49305716b..be8f12b8f195 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h | |||
@@ -59,7 +59,8 @@ enum { | |||
59 | CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ | 59 | CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ |
60 | CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, | 60 | CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, |
61 | CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, | 61 | CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, |
62 | CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12 | 62 | CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12, |
63 | CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13 | ||
63 | }; | 64 | }; |
64 | 65 | ||
65 | /* | 66 | /* |
@@ -205,6 +206,11 @@ struct crush_map { | |||
205 | * mappings line up a bit better with previous mappings. */ | 206 | * mappings line up a bit better with previous mappings. */ |
206 | __u8 chooseleaf_vary_r; | 207 | __u8 chooseleaf_vary_r; |
207 | 208 | ||
209 | /* if true, chooseleaf firstn returns stable results (if there is | ||
210 | * no local retry) so that data migration is optimal when some | ||
211 | * device fails. */ | ||
212 | __u8 chooseleaf_stable; | ||
213 | |||
208 | #ifndef __KERNEL__ | 214 | #ifndef __KERNEL__ |
209 | /* | 215 | /* |
210 | * version 0 (original) of straw_calc has various flaws. version 1 | 216 | * version 0 (original) of straw_calc has various flaws. version 1 |
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 393bfb22d5bb..5fcfb98f309e 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c | |||
@@ -403,6 +403,7 @@ static int is_out(const struct crush_map *map, | |||
403 | * @local_retries: localized retries | 403 | * @local_retries: localized retries |
404 | * @local_fallback_retries: localized fallback retries | 404 | * @local_fallback_retries: localized fallback retries |
405 | * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) | 405 | * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) |
406 | * @stable: stable mode starts rep=0 in the recursive call for all replicas | ||
406 | * @vary_r: pass r to recursive calls | 407 | * @vary_r: pass r to recursive calls |
407 | * @out2: second output vector for leaf items (if @recurse_to_leaf) | 408 | * @out2: second output vector for leaf items (if @recurse_to_leaf) |
408 | * @parent_r: r value passed from the parent | 409 | * @parent_r: r value passed from the parent |
@@ -419,6 +420,7 @@ static int crush_choose_firstn(const struct crush_map *map, | |||
419 | unsigned int local_fallback_retries, | 420 | unsigned int local_fallback_retries, |
420 | int recurse_to_leaf, | 421 | int recurse_to_leaf, |
421 | unsigned int vary_r, | 422 | unsigned int vary_r, |
423 | unsigned int stable, | ||
422 | int *out2, | 424 | int *out2, |
423 | int parent_r) | 425 | int parent_r) |
424 | { | 426 | { |
@@ -433,13 +435,13 @@ static int crush_choose_firstn(const struct crush_map *map, | |||
433 | int collide, reject; | 435 | int collide, reject; |
434 | int count = out_size; | 436 | int count = out_size; |
435 | 437 | ||
436 | dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", | 438 | dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n", |
437 | recurse_to_leaf ? "_LEAF" : "", | 439 | recurse_to_leaf ? "_LEAF" : "", |
438 | bucket->id, x, outpos, numrep, | 440 | bucket->id, x, outpos, numrep, |
439 | tries, recurse_tries, local_retries, local_fallback_retries, | 441 | tries, recurse_tries, local_retries, local_fallback_retries, |
440 | parent_r); | 442 | parent_r, stable); |
441 | 443 | ||
442 | for (rep = outpos; rep < numrep && count > 0 ; rep++) { | 444 | for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) { |
443 | /* keep trying until we get a non-out, non-colliding item */ | 445 | /* keep trying until we get a non-out, non-colliding item */ |
444 | ftotal = 0; | 446 | ftotal = 0; |
445 | skip_rep = 0; | 447 | skip_rep = 0; |
@@ -512,13 +514,14 @@ static int crush_choose_firstn(const struct crush_map *map, | |||
512 | if (crush_choose_firstn(map, | 514 | if (crush_choose_firstn(map, |
513 | map->buckets[-1-item], | 515 | map->buckets[-1-item], |
514 | weight, weight_max, | 516 | weight, weight_max, |
515 | x, outpos+1, 0, | 517 | x, stable ? 1 : outpos+1, 0, |
516 | out2, outpos, count, | 518 | out2, outpos, count, |
517 | recurse_tries, 0, | 519 | recurse_tries, 0, |
518 | local_retries, | 520 | local_retries, |
519 | local_fallback_retries, | 521 | local_fallback_retries, |
520 | 0, | 522 | 0, |
521 | vary_r, | 523 | vary_r, |
524 | stable, | ||
522 | NULL, | 525 | NULL, |
523 | sub_r) <= outpos) | 526 | sub_r) <= outpos) |
524 | /* didn't get leaf */ | 527 | /* didn't get leaf */ |
@@ -816,6 +819,7 @@ int crush_do_rule(const struct crush_map *map, | |||
816 | int choose_local_fallback_retries = map->choose_local_fallback_tries; | 819 | int choose_local_fallback_retries = map->choose_local_fallback_tries; |
817 | 820 | ||
818 | int vary_r = map->chooseleaf_vary_r; | 821 | int vary_r = map->chooseleaf_vary_r; |
822 | int stable = map->chooseleaf_stable; | ||
819 | 823 | ||
820 | if ((__u32)ruleno >= map->max_rules) { | 824 | if ((__u32)ruleno >= map->max_rules) { |
821 | dprintk(" bad ruleno %d\n", ruleno); | 825 | dprintk(" bad ruleno %d\n", ruleno); |
@@ -835,7 +839,8 @@ int crush_do_rule(const struct crush_map *map, | |||
835 | case CRUSH_RULE_TAKE: | 839 | case CRUSH_RULE_TAKE: |
836 | if ((curstep->arg1 >= 0 && | 840 | if ((curstep->arg1 >= 0 && |
837 | curstep->arg1 < map->max_devices) || | 841 | curstep->arg1 < map->max_devices) || |
838 | (-1-curstep->arg1 < map->max_buckets && | 842 | (-1-curstep->arg1 >= 0 && |
843 | -1-curstep->arg1 < map->max_buckets && | ||
839 | map->buckets[-1-curstep->arg1])) { | 844 | map->buckets[-1-curstep->arg1])) { |
840 | w[0] = curstep->arg1; | 845 | w[0] = curstep->arg1; |
841 | wsize = 1; | 846 | wsize = 1; |
@@ -869,6 +874,11 @@ int crush_do_rule(const struct crush_map *map, | |||
869 | vary_r = curstep->arg1; | 874 | vary_r = curstep->arg1; |
870 | break; | 875 | break; |
871 | 876 | ||
877 | case CRUSH_RULE_SET_CHOOSELEAF_STABLE: | ||
878 | if (curstep->arg1 >= 0) | ||
879 | stable = curstep->arg1; | ||
880 | break; | ||
881 | |||
872 | case CRUSH_RULE_CHOOSELEAF_FIRSTN: | 882 | case CRUSH_RULE_CHOOSELEAF_FIRSTN: |
873 | case CRUSH_RULE_CHOOSE_FIRSTN: | 883 | case CRUSH_RULE_CHOOSE_FIRSTN: |
874 | firstn = 1; | 884 | firstn = 1; |
@@ -888,6 +898,7 @@ int crush_do_rule(const struct crush_map *map, | |||
888 | osize = 0; | 898 | osize = 0; |
889 | 899 | ||
890 | for (i = 0; i < wsize; i++) { | 900 | for (i = 0; i < wsize; i++) { |
901 | int bno; | ||
891 | /* | 902 | /* |
892 | * see CRUSH_N, CRUSH_N_MINUS macros. | 903 | * see CRUSH_N, CRUSH_N_MINUS macros. |
893 | * basically, numrep <= 0 means relative to | 904 | * basically, numrep <= 0 means relative to |
@@ -900,6 +911,13 @@ int crush_do_rule(const struct crush_map *map, | |||
900 | continue; | 911 | continue; |
901 | } | 912 | } |
902 | j = 0; | 913 | j = 0; |
914 | /* make sure bucket id is valid */ | ||
915 | bno = -1 - w[i]; | ||
916 | if (bno < 0 || bno >= map->max_buckets) { | ||
917 | /* w[i] is probably CRUSH_ITEM_NONE */ | ||
918 | dprintk(" bad w[i] %d\n", w[i]); | ||
919 | continue; | ||
920 | } | ||
903 | if (firstn) { | 921 | if (firstn) { |
904 | int recurse_tries; | 922 | int recurse_tries; |
905 | if (choose_leaf_tries) | 923 | if (choose_leaf_tries) |
@@ -911,7 +929,7 @@ int crush_do_rule(const struct crush_map *map, | |||
911 | recurse_tries = choose_tries; | 929 | recurse_tries = choose_tries; |
912 | osize += crush_choose_firstn( | 930 | osize += crush_choose_firstn( |
913 | map, | 931 | map, |
914 | map->buckets[-1-w[i]], | 932 | map->buckets[bno], |
915 | weight, weight_max, | 933 | weight, weight_max, |
916 | x, numrep, | 934 | x, numrep, |
917 | curstep->arg2, | 935 | curstep->arg2, |
@@ -923,6 +941,7 @@ int crush_do_rule(const struct crush_map *map, | |||
923 | choose_local_fallback_retries, | 941 | choose_local_fallback_retries, |
924 | recurse_to_leaf, | 942 | recurse_to_leaf, |
925 | vary_r, | 943 | vary_r, |
944 | stable, | ||
926 | c+osize, | 945 | c+osize, |
927 | 0); | 946 | 0); |
928 | } else { | 947 | } else { |
@@ -930,7 +949,7 @@ int crush_do_rule(const struct crush_map *map, | |||
930 | numrep : (result_max-osize)); | 949 | numrep : (result_max-osize)); |
931 | crush_choose_indep( | 950 | crush_choose_indep( |
932 | map, | 951 | map, |
933 | map->buckets[-1-w[i]], | 952 | map->buckets[bno], |
934 | weight, weight_max, | 953 | weight, weight_max, |
935 | x, out_size, numrep, | 954 | x, out_size, numrep, |
936 | curstep->arg2, | 955 | curstep->arg2, |
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f8f235930d88..3534e12683d3 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c | |||
@@ -1770,6 +1770,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) | |||
1770 | u32 osdmap_epoch; | 1770 | u32 osdmap_epoch; |
1771 | int already_completed; | 1771 | int already_completed; |
1772 | u32 bytes; | 1772 | u32 bytes; |
1773 | u8 decode_redir; | ||
1773 | unsigned int i; | 1774 | unsigned int i; |
1774 | 1775 | ||
1775 | tid = le64_to_cpu(msg->hdr.tid); | 1776 | tid = le64_to_cpu(msg->hdr.tid); |
@@ -1841,6 +1842,15 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) | |||
1841 | p += 8 + 4; /* skip replay_version */ | 1842 | p += 8 + 4; /* skip replay_version */ |
1842 | p += 8; /* skip user_version */ | 1843 | p += 8; /* skip user_version */ |
1843 | 1844 | ||
1845 | if (le16_to_cpu(msg->hdr.version) >= 7) | ||
1846 | ceph_decode_8_safe(&p, end, decode_redir, bad_put); | ||
1847 | else | ||
1848 | decode_redir = 1; | ||
1849 | } else { | ||
1850 | decode_redir = 0; | ||
1851 | } | ||
1852 | |||
1853 | if (decode_redir) { | ||
1844 | err = ceph_redirect_decode(&p, end, &redir); | 1854 | err = ceph_redirect_decode(&p, end, &redir); |
1845 | if (err) | 1855 | if (err) |
1846 | goto bad_put; | 1856 | goto bad_put; |
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 7d8f581d9f1f..243574c8cf33 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c | |||
@@ -342,23 +342,32 @@ static struct crush_map *crush_decode(void *pbyval, void *end) | |||
342 | c->choose_local_tries = ceph_decode_32(p); | 342 | c->choose_local_tries = ceph_decode_32(p); |
343 | c->choose_local_fallback_tries = ceph_decode_32(p); | 343 | c->choose_local_fallback_tries = ceph_decode_32(p); |
344 | c->choose_total_tries = ceph_decode_32(p); | 344 | c->choose_total_tries = ceph_decode_32(p); |
345 | dout("crush decode tunable choose_local_tries = %d", | 345 | dout("crush decode tunable choose_local_tries = %d\n", |
346 | c->choose_local_tries); | 346 | c->choose_local_tries); |
347 | dout("crush decode tunable choose_local_fallback_tries = %d", | 347 | dout("crush decode tunable choose_local_fallback_tries = %d\n", |
348 | c->choose_local_fallback_tries); | 348 | c->choose_local_fallback_tries); |
349 | dout("crush decode tunable choose_total_tries = %d", | 349 | dout("crush decode tunable choose_total_tries = %d\n", |
350 | c->choose_total_tries); | 350 | c->choose_total_tries); |
351 | 351 | ||
352 | ceph_decode_need(p, end, sizeof(u32), done); | 352 | ceph_decode_need(p, end, sizeof(u32), done); |
353 | c->chooseleaf_descend_once = ceph_decode_32(p); | 353 | c->chooseleaf_descend_once = ceph_decode_32(p); |
354 | dout("crush decode tunable chooseleaf_descend_once = %d", | 354 | dout("crush decode tunable chooseleaf_descend_once = %d\n", |
355 | c->chooseleaf_descend_once); | 355 | c->chooseleaf_descend_once); |
356 | 356 | ||
357 | ceph_decode_need(p, end, sizeof(u8), done); | 357 | ceph_decode_need(p, end, sizeof(u8), done); |
358 | c->chooseleaf_vary_r = ceph_decode_8(p); | 358 | c->chooseleaf_vary_r = ceph_decode_8(p); |
359 | dout("crush decode tunable chooseleaf_vary_r = %d", | 359 | dout("crush decode tunable chooseleaf_vary_r = %d\n", |
360 | c->chooseleaf_vary_r); | 360 | c->chooseleaf_vary_r); |
361 | 361 | ||
362 | /* skip straw_calc_version, allowed_bucket_algs */ | ||
363 | ceph_decode_need(p, end, sizeof(u8) + sizeof(u32), done); | ||
364 | *p += sizeof(u8) + sizeof(u32); | ||
365 | |||
366 | ceph_decode_need(p, end, sizeof(u8), done); | ||
367 | c->chooseleaf_stable = ceph_decode_8(p); | ||
368 | dout("crush decode tunable chooseleaf_stable = %d\n", | ||
369 | c->chooseleaf_stable); | ||
370 | |||
362 | done: | 371 | done: |
363 | dout("crush_decode success\n"); | 372 | dout("crush_decode success\n"); |
364 | return c; | 373 | return c; |