diff options
author | Ilya Dryomov <idryomov@gmail.com> | 2016-07-18 21:50:28 -0400 |
---|---|---|
committer | Ilya Dryomov <idryomov@gmail.com> | 2016-07-22 09:17:40 -0400 |
commit | 930c532869774ebf8af9efe9484c597f896a7d46 (patch) | |
tree | 68072d2a5c42dbf347514b8737f1abd0df044966 | |
parent | 92d21ac74a9e3c09b0b01c764e530657e4c85c49 (diff) |
libceph: apply new_state before new_up_client on incrementals
Currently, osd_weight and osd_state fields are updated in the encoding
order. This is wrong, because an incremental map may look like e.g.
new_up_client: { osd=6, addr=... } # set osd_state and addr
new_state: { osd=6, xorstate=EXISTS } # clear osd_state
Suppose osd6's current osd_state is EXISTS (i.e. osd6 is down). After
applying new_up_client, osd_state is changed to EXISTS | UP. Carrying
on with the new_state update, we flip EXISTS and leave osd6 in a weird
"!EXISTS but UP" state. A non-existent OSD is considered down by the
mapping code
2087 for (i = 0; i < pg->pg_temp.len; i++) {
2088 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
2089 if (ceph_can_shift_osds(pi))
2090 continue;
2091
2092 temp->osds[temp->size++] = CRUSH_ITEM_NONE;
and so requests get directed to the second OSD in the set instead of
the first, resulting in OSD-side errors like:
[WRN] : client.4239 192.168.122.21:0/2444980242 misdirected client.4239.1:2827 pg 2.5df899f2 to osd.4 not [1,4,6] in e680/680
and hung rbds on the client:
[ 493.566367] rbd: rbd0: write 400000 at 11cc00000 (0)
[ 493.566805] rbd: rbd0: result -6 xferred 400000
[ 493.567011] blk_update_request: I/O error, dev rbd0, sector 9330688
The fix is to decouple application from the decoding and:
- apply new_weight first
- apply new_state before new_up_client
- twiddle osd_state flags if marking in
- clear out some of the state if osd is destroyed
Fixes: http://tracker.ceph.com/issues/14901
Cc: stable@vger.kernel.org # 3.15+: 6dd74e44dc1d: libceph: set 'exists' flag for newly up osd
Cc: stable@vger.kernel.org # 3.15+
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Josh Durgin <jdurgin@redhat.com>
-rw-r--r-- | net/ceph/osdmap.c | 156 |
1 files changed, 113 insertions, 43 deletions
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 03062bb763b3..7e480bf75bcf 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c | |||
@@ -1261,6 +1261,115 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) | |||
1261 | } | 1261 | } |
1262 | 1262 | ||
1263 | /* | 1263 | /* |
1264 | * Encoding order is (new_up_client, new_state, new_weight). Need to | ||
1265 | * apply in the (new_weight, new_state, new_up_client) order, because | ||
1266 | * an incremental map may look like e.g. | ||
1267 | * | ||
1268 | * new_up_client: { osd=6, addr=... } # set osd_state and addr | ||
1269 | * new_state: { osd=6, xorstate=EXISTS } # clear osd_state | ||
1270 | */ | ||
1271 | static int decode_new_up_state_weight(void **p, void *end, | ||
1272 | struct ceph_osdmap *map) | ||
1273 | { | ||
1274 | void *new_up_client; | ||
1275 | void *new_state; | ||
1276 | void *new_weight_end; | ||
1277 | u32 len; | ||
1278 | |||
1279 | new_up_client = *p; | ||
1280 | ceph_decode_32_safe(p, end, len, e_inval); | ||
1281 | len *= sizeof(u32) + sizeof(struct ceph_entity_addr); | ||
1282 | ceph_decode_need(p, end, len, e_inval); | ||
1283 | *p += len; | ||
1284 | |||
1285 | new_state = *p; | ||
1286 | ceph_decode_32_safe(p, end, len, e_inval); | ||
1287 | len *= sizeof(u32) + sizeof(u8); | ||
1288 | ceph_decode_need(p, end, len, e_inval); | ||
1289 | *p += len; | ||
1290 | |||
1291 | /* new_weight */ | ||
1292 | ceph_decode_32_safe(p, end, len, e_inval); | ||
1293 | while (len--) { | ||
1294 | s32 osd; | ||
1295 | u32 w; | ||
1296 | |||
1297 | ceph_decode_need(p, end, 2*sizeof(u32), e_inval); | ||
1298 | osd = ceph_decode_32(p); | ||
1299 | w = ceph_decode_32(p); | ||
1300 | BUG_ON(osd >= map->max_osd); | ||
1301 | pr_info("osd%d weight 0x%x %s\n", osd, w, | ||
1302 | w == CEPH_OSD_IN ? "(in)" : | ||
1303 | (w == CEPH_OSD_OUT ? "(out)" : "")); | ||
1304 | map->osd_weight[osd] = w; | ||
1305 | |||
1306 | /* | ||
1307 | * If we are marking in, set the EXISTS, and clear the | ||
1308 | * AUTOOUT and NEW bits. | ||
1309 | */ | ||
1310 | if (w) { | ||
1311 | map->osd_state[osd] |= CEPH_OSD_EXISTS; | ||
1312 | map->osd_state[osd] &= ~(CEPH_OSD_AUTOOUT | | ||
1313 | CEPH_OSD_NEW); | ||
1314 | } | ||
1315 | } | ||
1316 | new_weight_end = *p; | ||
1317 | |||
1318 | /* new_state (up/down) */ | ||
1319 | *p = new_state; | ||
1320 | len = ceph_decode_32(p); | ||
1321 | while (len--) { | ||
1322 | s32 osd; | ||
1323 | u8 xorstate; | ||
1324 | int ret; | ||
1325 | |||
1326 | osd = ceph_decode_32(p); | ||
1327 | xorstate = ceph_decode_8(p); | ||
1328 | if (xorstate == 0) | ||
1329 | xorstate = CEPH_OSD_UP; | ||
1330 | BUG_ON(osd >= map->max_osd); | ||
1331 | if ((map->osd_state[osd] & CEPH_OSD_UP) && | ||
1332 | (xorstate & CEPH_OSD_UP)) | ||
1333 | pr_info("osd%d down\n", osd); | ||
1334 | if ((map->osd_state[osd] & CEPH_OSD_EXISTS) && | ||
1335 | (xorstate & CEPH_OSD_EXISTS)) { | ||
1336 | pr_info("osd%d does not exist\n", osd); | ||
1337 | map->osd_weight[osd] = CEPH_OSD_IN; | ||
1338 | ret = set_primary_affinity(map, osd, | ||
1339 | CEPH_OSD_DEFAULT_PRIMARY_AFFINITY); | ||
1340 | if (ret) | ||
1341 | return ret; | ||
1342 | memset(map->osd_addr + osd, 0, sizeof(*map->osd_addr)); | ||
1343 | map->osd_state[osd] = 0; | ||
1344 | } else { | ||
1345 | map->osd_state[osd] ^= xorstate; | ||
1346 | } | ||
1347 | } | ||
1348 | |||
1349 | /* new_up_client */ | ||
1350 | *p = new_up_client; | ||
1351 | len = ceph_decode_32(p); | ||
1352 | while (len--) { | ||
1353 | s32 osd; | ||
1354 | struct ceph_entity_addr addr; | ||
1355 | |||
1356 | osd = ceph_decode_32(p); | ||
1357 | ceph_decode_copy(p, &addr, sizeof(addr)); | ||
1358 | ceph_decode_addr(&addr); | ||
1359 | BUG_ON(osd >= map->max_osd); | ||
1360 | pr_info("osd%d up\n", osd); | ||
1361 | map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP; | ||
1362 | map->osd_addr[osd] = addr; | ||
1363 | } | ||
1364 | |||
1365 | *p = new_weight_end; | ||
1366 | return 0; | ||
1367 | |||
1368 | e_inval: | ||
1369 | return -EINVAL; | ||
1370 | } | ||
1371 | |||
1372 | /* | ||
1264 | * decode and apply an incremental map update. | 1373 | * decode and apply an incremental map update. |
1265 | */ | 1374 | */ |
1266 | struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | 1375 | struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, |
@@ -1358,49 +1467,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | |||
1358 | __remove_pg_pool(&map->pg_pools, pi); | 1467 | __remove_pg_pool(&map->pg_pools, pi); |
1359 | } | 1468 | } |
1360 | 1469 | ||
1361 | /* new_up */ | 1470 | /* new_up_client, new_state, new_weight */ |
1362 | ceph_decode_32_safe(p, end, len, e_inval); | 1471 | err = decode_new_up_state_weight(p, end, map); |
1363 | while (len--) { | 1472 | if (err) |
1364 | u32 osd; | 1473 | goto bad; |
1365 | struct ceph_entity_addr addr; | ||
1366 | ceph_decode_32_safe(p, end, osd, e_inval); | ||
1367 | ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval); | ||
1368 | ceph_decode_addr(&addr); | ||
1369 | pr_info("osd%d up\n", osd); | ||
1370 | BUG_ON(osd >= map->max_osd); | ||
1371 | map->osd_state[osd] |= CEPH_OSD_UP | CEPH_OSD_EXISTS; | ||
1372 | map->osd_addr[osd] = addr; | ||
1373 | } | ||
1374 | |||
1375 | /* new_state */ | ||
1376 | ceph_decode_32_safe(p, end, len, e_inval); | ||
1377 | while (len--) { | ||
1378 | u32 osd; | ||
1379 | u8 xorstate; | ||
1380 | ceph_decode_32_safe(p, end, osd, e_inval); | ||
1381 | xorstate = **(u8 **)p; | ||
1382 | (*p)++; /* clean flag */ | ||
1383 | if (xorstate == 0) | ||
1384 | xorstate = CEPH_OSD_UP; | ||
1385 | if (xorstate & CEPH_OSD_UP) | ||
1386 | pr_info("osd%d down\n", osd); | ||
1387 | if (osd < map->max_osd) | ||
1388 | map->osd_state[osd] ^= xorstate; | ||
1389 | } | ||
1390 | |||
1391 | /* new_weight */ | ||
1392 | ceph_decode_32_safe(p, end, len, e_inval); | ||
1393 | while (len--) { | ||
1394 | u32 osd, off; | ||
1395 | ceph_decode_need(p, end, sizeof(u32)*2, e_inval); | ||
1396 | osd = ceph_decode_32(p); | ||
1397 | off = ceph_decode_32(p); | ||
1398 | pr_info("osd%d weight 0x%x %s\n", osd, off, | ||
1399 | off == CEPH_OSD_IN ? "(in)" : | ||
1400 | (off == CEPH_OSD_OUT ? "(out)" : "")); | ||
1401 | if (osd < map->max_osd) | ||
1402 | map->osd_weight[osd] = off; | ||
1403 | } | ||
1404 | 1474 | ||
1405 | /* new_pg_temp */ | 1475 | /* new_pg_temp */ |
1406 | err = decode_new_pg_temp(p, end, map); | 1476 | err = decode_new_pg_temp(p, end, map); |