aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorSage Weil <sage@inktank.com>2013-02-26 13:39:09 -0500
committerSage Weil <sage@inktank.com>2013-02-26 18:03:06 -0500
commit83ca14fdd35821554058e5fd4fa7b118ee504a33 (patch)
tree8f105c71b91854c68db4281f255723356e709ccd /net
parent1b83bef24c6746a146d39915a18fb5425f2facb0 (diff)
libceph: add support for HASHPSPOOL pool flag
The legacy behavior adds the pgid seed and pool together as the input for CRUSH. That is problematic because each pool's PGs end up mapping to the same OSDs: 1.5 == 2.4 == 3.3 == ... Instead, if the HASHPSPOOL flag is set, we hash the ps and pool together and feed that into CRUSH. This ensures that two adjacent pools will map to an independent pseudorandom set of OSDs. Advertise our support for this via a protocol feature flag. Signed-off-by: Sage Weil <sage@inktank.com> Reviewed-by: Alex Elder <elder@inktank.com>
Diffstat (limited to 'net')
-rw-r--r--net/ceph/osdmap.c39
1 files changed, 26 insertions, 13 deletions
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 378471644501..69bc4bf89e3e 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1127,18 +1127,16 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1127 struct ceph_pg_mapping *pg; 1127 struct ceph_pg_mapping *pg;
1128 struct ceph_pg_pool_info *pool; 1128 struct ceph_pg_pool_info *pool;
1129 int ruleno; 1129 int ruleno;
1130 unsigned int poolid, ps, pps, t, r; 1130 int r;
1131 u32 pps;
1131 1132
1132 poolid = pgid.pool; 1133 pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
1133 ps = pgid.seed;
1134
1135 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
1136 if (!pool) 1134 if (!pool)
1137 return NULL; 1135 return NULL;
1138 1136
1139 /* pg_temp? */ 1137 /* pg_temp? */
1140 t = ceph_stable_mod(ps, pool->pg_num, pool->pgp_num_mask); 1138 pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
1141 pgid.seed = t; 1139 pool->pgp_num_mask);
1142 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); 1140 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
1143 if (pg) { 1141 if (pg) {
1144 *num = pg->len; 1142 *num = pg->len;
@@ -1149,20 +1147,35 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1149 ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, 1147 ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
1150 pool->type, pool->size); 1148 pool->type, pool->size);
1151 if (ruleno < 0) { 1149 if (ruleno < 0) {
1152 pr_err("no crush rule pool %d ruleset %d type %d size %d\n", 1150 pr_err("no crush rule pool %lld ruleset %d type %d size %d\n",
1153 poolid, pool->crush_ruleset, pool->type, 1151 pgid.pool, pool->crush_ruleset, pool->type,
1154 pool->size); 1152 pool->size);
1155 return NULL; 1153 return NULL;
1156 } 1154 }
1157 1155
1158 pps = ceph_stable_mod(ps, pool->pgp_num, pool->pgp_num_mask); 1156 if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
1159 pps += poolid; 1157 /* hash pool id and seed so that pool PGs do not overlap */
1158 pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
1159 ceph_stable_mod(pgid.seed, pool->pgp_num,
1160 pool->pgp_num_mask),
1161 pgid.pool);
1162 } else {
1163 /*
1164 * legacy behavior: add ps and pool together. this is
1165 * not a great approach because the PGs from each pool
1166 * will overlap on top of each other: 0.5 == 1.4 ==
1167 * 2.3 == ...
1168 */
1169 pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
1170 pool->pgp_num_mask) +
1171 (unsigned)pgid.pool;
1172 }
1160 r = crush_do_rule(osdmap->crush, ruleno, pps, osds, 1173 r = crush_do_rule(osdmap->crush, ruleno, pps, osds,
1161 min_t(int, pool->size, *num), 1174 min_t(int, pool->size, *num),
1162 osdmap->osd_weight); 1175 osdmap->osd_weight);
1163 if (r < 0) { 1176 if (r < 0) {
1164 pr_err("error %d from crush rule: pool %d ruleset %d type %d" 1177 pr_err("error %d from crush rule: pool %lld ruleset %d type %d"
1165 " size %d\n", r, poolid, pool->crush_ruleset, 1178 " size %d\n", r, pgid.pool, pool->crush_ruleset,
1166 pool->type, pool->size); 1179 pool->type, pool->size);
1167 return NULL; 1180 return NULL;
1168 } 1181 }