aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSage Weil <sage@inktank.com>2013-02-26 13:39:09 -0500
committerSage Weil <sage@inktank.com>2013-02-26 18:03:06 -0500
commit83ca14fdd35821554058e5fd4fa7b118ee504a33 (patch)
tree8f105c71b91854c68db4281f255723356e709ccd
parent1b83bef24c6746a146d39915a18fb5425f2facb0 (diff)
libceph: add support for HASHPSPOOL pool flag
The legacy behavior adds the pgid seed and pool together as the input for CRUSH. That is problematic because each pool's PGs end up mapping to the same OSDs: 1.5 == 2.4 == 3.3 == ... Instead, if the HASHPSPOOL flag is set, we hash the ps and pool together and feed that into CRUSH. This ensures that two adjacent pools will map to an independent pseudorandom set of OSDs. Advertise our support for this via a protocol feature flag. Signed-off-by: Sage Weil <sage@inktank.com> Reviewed-by: Alex Elder <elder@inktank.com>
-rw-r--r--include/linux/ceph/ceph_features.h4
-rw-r--r--include/linux/ceph/osdmap.h2
-rw-r--r--net/ceph/osdmap.c39
3 files changed, 31 insertions, 14 deletions
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index ab0a54286e0d..76554cecaab2 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -34,6 +34,7 @@
34#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27) 34#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27)
35#define CEPH_FEATURE_OSD_HBMSGS (1<<28) 35#define CEPH_FEATURE_OSD_HBMSGS (1<<28)
36#define CEPH_FEATURE_MDSENC (1<<29) 36#define CEPH_FEATURE_MDSENC (1<<29)
37#define CEPH_FEATURE_OSDHASHPSPOOL (1<<30)
37 38
38/* 39/*
39 * Features supported. 40 * Features supported.
@@ -45,7 +46,8 @@
45 CEPH_FEATURE_OSDENC | \ 46 CEPH_FEATURE_OSDENC | \
46 CEPH_FEATURE_CRUSH_TUNABLES | \ 47 CEPH_FEATURE_CRUSH_TUNABLES | \
47 CEPH_FEATURE_CRUSH_TUNABLES2 | \ 48 CEPH_FEATURE_CRUSH_TUNABLES2 | \
48 CEPH_FEATURE_REPLY_CREATE_INODE) 49 CEPH_FEATURE_REPLY_CREATE_INODE | \
50 CEPH_FEATURE_OSDHASHPSPOOL)
49 51
50#define CEPH_FEATURES_REQUIRED_DEFAULT \ 52#define CEPH_FEATURES_REQUIRED_DEFAULT \
51 (CEPH_FEATURE_NOSRCADDR | \ 53 (CEPH_FEATURE_NOSRCADDR | \
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 35985125f118..c819190d1642 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -23,6 +23,8 @@ struct ceph_pg {
23 uint32_t seed; 23 uint32_t seed;
24}; 24};
25 25
26#define CEPH_POOL_FLAG_HASHPSPOOL 1
27
26struct ceph_pg_pool_info { 28struct ceph_pg_pool_info {
27 struct rb_node node; 29 struct rb_node node;
28 s64 id; 30 s64 id;
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 378471644501..69bc4bf89e3e 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1127,18 +1127,16 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1127 struct ceph_pg_mapping *pg; 1127 struct ceph_pg_mapping *pg;
1128 struct ceph_pg_pool_info *pool; 1128 struct ceph_pg_pool_info *pool;
1129 int ruleno; 1129 int ruleno;
1130 unsigned int poolid, ps, pps, t, r; 1130 int r;
1131 u32 pps;
1131 1132
1132 poolid = pgid.pool; 1133 pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
1133 ps = pgid.seed;
1134
1135 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
1136 if (!pool) 1134 if (!pool)
1137 return NULL; 1135 return NULL;
1138 1136
1139 /* pg_temp? */ 1137 /* pg_temp? */
1140 t = ceph_stable_mod(ps, pool->pg_num, pool->pgp_num_mask); 1138 pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
1141 pgid.seed = t; 1139 pool->pgp_num_mask);
1142 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); 1140 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
1143 if (pg) { 1141 if (pg) {
1144 *num = pg->len; 1142 *num = pg->len;
@@ -1149,20 +1147,35 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1149 ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, 1147 ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
1150 pool->type, pool->size); 1148 pool->type, pool->size);
1151 if (ruleno < 0) { 1149 if (ruleno < 0) {
1152 pr_err("no crush rule pool %d ruleset %d type %d size %d\n", 1150 pr_err("no crush rule pool %lld ruleset %d type %d size %d\n",
1153 poolid, pool->crush_ruleset, pool->type, 1151 pgid.pool, pool->crush_ruleset, pool->type,
1154 pool->size); 1152 pool->size);
1155 return NULL; 1153 return NULL;
1156 } 1154 }
1157 1155
1158 pps = ceph_stable_mod(ps, pool->pgp_num, pool->pgp_num_mask); 1156 if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
1159 pps += poolid; 1157 /* hash pool id and seed so that pool PGs do not overlap */
1158 pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
1159 ceph_stable_mod(pgid.seed, pool->pgp_num,
1160 pool->pgp_num_mask),
1161 pgid.pool);
1162 } else {
1163 /*
1164 * legacy ehavior: add ps and pool together. this is
1165 * not a great approach because the PGs from each pool
1166 * will overlap on top of each other: 0.5 == 1.4 ==
1167 * 2.3 == ...
1168 */
1169 pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
1170 pool->pgp_num_mask) +
1171 (unsigned)pgid.pool;
1172 }
1160 r = crush_do_rule(osdmap->crush, ruleno, pps, osds, 1173 r = crush_do_rule(osdmap->crush, ruleno, pps, osds,
1161 min_t(int, pool->size, *num), 1174 min_t(int, pool->size, *num),
1162 osdmap->osd_weight); 1175 osdmap->osd_weight);
1163 if (r < 0) { 1176 if (r < 0) {
1164 pr_err("error %d from crush rule: pool %d ruleset %d type %d" 1177 pr_err("error %d from crush rule: pool %lld ruleset %d type %d"
1165 " size %d\n", r, poolid, pool->crush_ruleset, 1178 " size %d\n", r, pgid.pool, pool->crush_ruleset,
1166 pool->type, pool->size); 1179 pool->type, pool->size);
1167 return NULL; 1180 return NULL;
1168 } 1181 }